diff --git a/.bazelrc b/.bazelrc
index ceba7bfdbac74d1e44aadc3010e5e84bd36ce3ee..d5432aeb1757677a3e53bbb76ada2148ca5c23c6 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -25,12 +25,14 @@ build --define framework_shared_object=true
 # If you would like to use a local MKL instead of downloading, please set the
 # environment variable "TF_MKL_ROOT" every time before build.
 build:mkl --define=build_with_mkl=true --define=enable_mkl=true
+build:mkl --define=tensorflow_mkldnn_contraction_kernel=0
 build:mkl -c opt
 
 # This config option is used to enable MKL-DNN open source library only,
 # without depending on MKL binary version.
 build:mkl_open_source_only --define=build_with_mkl_dnn_only=true
 build:mkl_open_source_only --define=build_with_mkl=true --define=enable_mkl=true
+build:mkl_open_source_only --define=tensorflow_mkldnn_contraction_kernel=0
 
 build:download_clang --crosstool_top=@local_config_download_clang//:toolchain
 build:download_clang --define=using_clang=true
@@ -93,9 +95,6 @@ build --define=PREFIX=/usr
 build --define=LIBDIR=$(PREFIX)/lib
 build --define=INCLUDEDIR=$(PREFIX)/include
 
-# Disable MKL-DNN contraction kernels by default.
-build --define=tensorflow_mkldnn_contraction_kernel=0
-
 # Default options should come above this line
 
 # Options from ./configure
diff --git a/RELEASE.md b/RELEASE.md
index 282430d12303bde980e19e3c3602eb91b1a54d63..0a56e6909870e398c9d6349576cd2f8e6734f072 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -849,7 +849,7 @@ answered questions, and were part of inspiring discussions.
 * Remove `tf.contrib.data.Iterator.from_dataset()` method. Use
   `Dataset.make_initializable_iterator()` instead.
 * Remove seldom used and unnecessary `tf.contrib.data.Iterator.dispose_op()`.
-* Reorder some TFGAN loss functions in a non-backwards compatible way.
+* Reorder some TF-GAN loss functions in a non-backwards compatible way.
 
 ## Known Issues
 * In Python 3, `Dataset.from_generator()` does not support Unicode strings.
diff --git a/WORKSPACE b/WORKSPACE
index 2277e83a3f67b62cf4ee1311767ee06c0549c697..1c59686f16c9eb25bf509b216d18628250157319 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -4,11 +4,11 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive", "http_file"
 
 http_archive(
     name = "io_bazel_rules_closure",
-    sha256 = "a38539c5b5c358548e75b44141b4ab637bba7c4dc02b46b1f62a96d6433f56ae",
-    strip_prefix = "rules_closure-dbb96841cc0a5fb2664c37822803b06dab20c7d1",
+    sha256 = "43c9b882fa921923bcba764453f4058d102bece35a37c9f6383c713004aacff1",
+    strip_prefix = "rules_closure-9889e2348259a5aad7e805547c1a0cf311cfcd91",
     urls = [
-        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz",
-        "https://github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz",  # 2018-04-13
+        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/9889e2348259a5aad7e805547c1a0cf311cfcd91.tar.gz",
+        "https://github.com/bazelbuild/rules_closure/archive/9889e2348259a5aad7e805547c1a0cf311cfcd91.tar.gz",  # 2018-12-21
     ],
 )
 
diff --git a/configure.py b/configure.py
index c588381d40cce95df030532c4131616e4bf16c08..bb59063f7932dd1f74e12a39db029a86268a7c87 100644
--- a/configure.py
+++ b/configure.py
@@ -33,7 +33,7 @@ except ImportError:
   from distutils.spawn import find_executable as which
 # pylint: enable=g-import-not-at-top
 
-_DEFAULT_CUDA_VERSION = '9.0'
+_DEFAULT_CUDA_VERSION = '10.0'
 _DEFAULT_CUDNN_VERSION = '7'
 _DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,7.0'
 _DEFAULT_CUDA_PATH = '/usr/local/cuda'
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index f07e7365d3482cde5b7bb76ebf22890150e98651..29d71c323ab5ee860ebf48c332cfd7f607f3f0c3 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -370,13 +370,21 @@ config_setting(
     define_values = {"tf_api_version": "2"},
 )
 
+# This flag is defined for select statements that match both
+# on 'windows' and 'api_version_2'. In this case, bazel requires
+# having a flag which is a superset of these two.
+config_setting(
+    name = "windows_and_api_version_2",
+    define_values = {"tf_api_version": "2"},
+    values = {"cpu": "x64_windows"},
+)
+
 package_group(
     name = "internal",
     packages = [
         "-//third_party/tensorflow/python/estimator",
         "//learning/deepmind/...",
         "//learning/meta_rank/...",
-        "//learning/pathways/...",  # While dataset C++ api requires internals
         "//tensorflow/...",
         "//tensorflow_estimator/contrib/...",
         "//tensorflow_fold/llgtm/...",
diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py
index 2c0a7452692e5cdb184f7f0a77eb1b646a1772d4..a93799bfe84b0f9c4743e1ad0effd6e69ad7f3f2 100644
--- a/tensorflow/api_template.__init__.py
+++ b/tensorflow/api_template.__init__.py
@@ -52,7 +52,7 @@ elif _tf_api_dir not in __path__:
   __path__.append(_tf_api_dir)
 
 # Enable TF2 behaviors
-from tensorflow.python.compat import compat as _compat  # pylint: disable=g-import-not-at-top
+from tensorflow.python.compat import v2_compat as _compat  # pylint: disable=g-import-not-at-top
 _compat.enable_v2_behavior()
 
 
diff --git a/tensorflow/api_template_v1.__init__.py b/tensorflow/api_template_v1.__init__.py
index 514aba1b59631f882523396aab0f4d3d5e88a893..b293d0f1534d2017efa9bfed655fa6eadc5d88de 100644
--- a/tensorflow/api_template_v1.__init__.py
+++ b/tensorflow/api_template_v1.__init__.py
@@ -62,7 +62,8 @@ if '__all__' in vars():
   vars()['__all__'].append('contrib')
 
 from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
-app.flags = flags  # pylint: disable=undefined-variable
+from tensorflow.python.platform import app  # pylint: disable=g-import-not-at-top
+app.flags = flags
 
 # Make sure directory containing top level submodules is in
 # the __path__ so that "from tensorflow.foo import bar" works.
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 9580215a317b1a6b1cdacbd430a1764af61be990..9f2f83920cc73028fd2372afaf303e8b1c1c64f9 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -2881,6 +2881,9 @@ const char* TF_ServerTarget(TF_Server* server) {
 #endif
 }
 
-void TF_DeleteServer(TF_Server* server) { delete server; }
-
+void TF_DeleteServer(TF_Server* server) {
+#ifndef __ANDROID__
+  delete server;
+#endif
+}
 }  // end extern "C"
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index 9f09ad1fc307ed96f327df289c0a45bf79325d7e..3ea1724d1e6ac4135ef6883e2bba3e56ee401a41 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -3,11 +3,10 @@ licenses(["notice"])  # Apache 2.0
 
 load(
     "//tensorflow:tensorflow.bzl",
-    "tf_cuda_cc_test",
-    "tf_cc_test",
     "tf_copts",
-    "tfe_xla_copts",
+    "tf_cuda_cc_test",
     "tf_cuda_library",
+    "tfe_xla_copts",
 )
 
 tf_cuda_library(
@@ -29,6 +28,7 @@ tf_cuda_library(
             "//tensorflow/c:c_api_internal",
             "//tensorflow/core:core_cpu",
             "//tensorflow/core/common_runtime/eager:attr_builder",
+            "//tensorflow/core/common_runtime/eager:profiler",
             "//tensorflow/core/common_runtime/eager:context",
             "//tensorflow/core/common_runtime/eager:eager_executor",
             "//tensorflow/core/common_runtime/eager:execute",
@@ -89,6 +89,7 @@ tf_cuda_library(
         "//tensorflow/core/common_runtime/eager:eager_executor",
         "//tensorflow/core/common_runtime/eager:eager_operation",
         "//tensorflow/core/common_runtime/eager:kernel_and_device",
+        "//tensorflow/core/common_runtime/eager:profiler",
         "//tensorflow/core/common_runtime/eager:tensor_handle",
         "//tensorflow/core/distributed_runtime:remote_device",
         "//tensorflow/core/distributed_runtime:server_lib",
@@ -204,6 +205,31 @@ tf_cuda_library(
     ],
 )
 
+tf_cuda_cc_test(
+    name = "c_api_experimental_test",
+    size = "small",
+    srcs = [
+        "c_api_experimental_test.cc",
+    ],
+    extra_copts = tfe_xla_copts(),
+    tags = [
+        "guitar",
+        "multi_gpu",
+    ],
+    deps = [
+        ":c_api_experimental",
+        ":c_api_test_util",
+        "//tensorflow/c:c_test_util",
+        "//tensorflow/cc/profiler",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/profiler:protos_all_cc",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 cc_library(
     name = "tape",
     hdrs = ["tape.h"],
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 027d752f420238da867cb9d8c116640e1730caaa..d5a391a98d2bdc6f80858c6673aa2ce2fac6f49a 100755
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -774,7 +774,7 @@ void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf,
   if (!status->status.ok()) return;
   tensorflow::mutex_lock ml(*ctx->context.MetadataMu());
   status->status = MessageToBuffer(*ctx->context.RunMetadataProto(), buf);
-  ctx->context.RunMetadataProto()->Clear();
+  ctx->context.ClearRunMetadata();
 }
 
 namespace {
diff --git a/tensorflow/c/eager/c_api_debug.cc b/tensorflow/c/eager/c_api_debug.cc
index 52b0824552855860dfb138f3ac9a5d3afa7dc965..ffcd5ace0b98597363abe63201bf6c328a03212f 100644
--- a/tensorflow/c/eager/c_api_debug.cc
+++ b/tensorflow/c/eager/c_api_debug.cc
@@ -83,7 +83,7 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo(
       }
     }
 
-    if (xla::ShapeUtil::IsTuple(padded_shape)) {
+    if (padded_shape.IsTuple()) {
       if (xla::ShapeUtil::TupleElementCount(padded_shape) != 2) {
         // Currently, the only case of XlaTensor containing a tuple shape is to
         // represent 64 bit ints, doubles, and complex numbers (we don't support
@@ -99,7 +99,7 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo(
       xla::Shape shape0 = xla::ShapeUtil::GetTupleElementShape(padded_shape, 0);
       const xla::Shape& shape1 =
           xla::ShapeUtil::GetTupleElementShape(padded_shape, 1);
-      if (xla::ShapeUtil::IsTuple(shape0) || xla::ShapeUtil::IsTuple(shape1)) {
+      if (shape0.IsTuple() || shape1.IsTuple()) {
         status->status = tensorflow::errors::InvalidArgument(
             "XlaTensors should not contain nested tuples. Shape: ",
             padded_shape.DebugString());
diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc
index 3461d81b93594021b4e977aa60ef5d1f92e1fc5c..1ce03fb22693960627c27cd4aec58106a9ff3218 100644
--- a/tensorflow/c/eager/c_api_experimental.cc
+++ b/tensorflow/c/eager/c_api_experimental.cc
@@ -18,6 +18,29 @@ limitations under the License.
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/c/eager/c_api_internal.h"
 
+using tensorflow::string;
+
 void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
   op->operation.ConsumeInput(h->handle);
 }
+
+TFE_Profiler* TFE_NewProfiler(TFE_Context* ctx) {
+  return new TFE_Profiler(ctx);
+}
+
+void TFE_DeleteProfiler(TFE_Profiler* profiler) { delete profiler; }
+
+void TFE_ProfilerSerializeToString(TFE_Context* ctx, TFE_Profiler* profiler,
+                                   TF_Buffer* buf, TF_Status* status) {
+  TFE_ContextAsyncWait(ctx, status);
+  if (!status->status.ok()) return;
+  string content;
+  status->status = profiler->profiler->SerializeToString(&content);
+  void* data = tensorflow::port::Malloc(content.length());
+  content.copy(static_cast<char*>(data), content.length(), 0);
+  buf->data = data;
+  buf->length = content.length();
+  buf->data_deallocator = [](void* data, size_t length) {
+    tensorflow::port::Free(data);
+  };
+}
diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h
index 4ee6c066eef7f60fd2f50a197c4dc20ed9c9f927..9eb80f521624e0116dd8ea5e4dbbf7e3d350a09c 100644
--- a/tensorflow/c/eager/c_api_experimental.h
+++ b/tensorflow/c/eager/c_api_experimental.h
@@ -25,6 +25,24 @@ extern "C" {
 TF_CAPI_EXPORT extern void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h,
                                               TF_Status* status);
 
+// A profiler which will start profiling when creating the object and will stop
+// when the object is destroyed. It will profile all operations run under the
+// given TFE_Context. Multiple instance of it can be created, but at most one
+// of them will profile for each TFE_Context.
+// Thread-safety: TFE_Profiler is thread-safe.
+typedef struct TFE_Profiler TFE_Profiler;
+
+TF_CAPI_EXPORT extern TFE_Profiler* TFE_NewProfiler(TFE_Context* ctx);
+TF_CAPI_EXPORT extern void TFE_DeleteProfiler(TFE_Profiler* profiler);
+
+// The output string is a binary string of tensorflow.tfprof.ProfileProto.
+// User can write the string to file for offline analysis by tfprof command-line
+// tools or graphical user interface.
+TF_CAPI_EXPORT extern void TFE_ProfilerSerializeToString(TFE_Context* ctx,
+                                                         TFE_Profiler* profiler,
+                                                         TF_Buffer* buf,
+                                                         TF_Status* status);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/eager/c_api_experimental_test.cc b/tensorflow/c/eager/c_api_experimental_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9b4fca9d45af2d72e1c21e711b60636ab3f39714
--- /dev/null
+++ b/tensorflow/c/eager/c_api_experimental_test.cc
@@ -0,0 +1,79 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/eager/c_api_experimental.h"
+
+#include <string.h>
+#include "absl/strings/match.h"
+#include "tensorflow/c/eager/c_api_test_util.h"
+#include "tensorflow/cc/profiler/profiler.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/profiler/tfprof_log.pb.h"
+
+using tensorflow::string;
+
+namespace {
+
+void ExecuteWithProfiling(bool async) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  TFE_Profiler* profiler = TFE_NewProfiler(ctx);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* m = TestMatrixTensorHandle();
+  TFE_Op* matmul = MatMulOp(ctx, m, m);
+  TFE_TensorHandle* retvals[1] = {nullptr};
+  int num_retvals = 1;
+  TFE_Execute(matmul, &retvals[0], &num_retvals, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteOp(matmul);
+  TFE_DeleteTensorHandle(m);
+
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  ASSERT_EQ(1, num_retvals);
+  TF_Buffer* profiler_result = TF_NewBuffer();
+  TFE_ProfilerSerializeToString(ctx, profiler, profiler_result, status);
+  TFE_DeleteProfiler(profiler);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  tensorflow::tfprof::ProfileProto profile_proto;
+  EXPECT_TRUE(profile_proto.ParseFromString(
+      {reinterpret_cast<const char*>(profiler_result->data),
+       profiler_result->length}));
+  TF_DeleteBuffer(profiler_result);
+
+  TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+  TFE_DeleteTensorHandle(retvals[0]);
+  TFE_DeleteContext(ctx);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  float product[4] = {0};
+  EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
+  memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t));
+  TF_DeleteTensor(t);
+  EXPECT_EQ(7, product[0]);
+  EXPECT_EQ(10, product[1]);
+  EXPECT_EQ(15, product[2]);
+  EXPECT_EQ(22, product[3]);
+  TF_DeleteStatus(status);
+}
+TEST(CAPI, ExecuteWithTracing) { ExecuteWithProfiling(false); }
+TEST(CAPI, ExecuteWithTracingAsync) { ExecuteWithProfiling(true); }
+
+}  // namespace
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index 67bc1bcd24605f8363d6a7c8d5d6a0836a42fc82..7b1035f631ee4ca5283da93696d619dcf7b6deba 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eager/eager_executor.h"
 #include "tensorflow/core/common_runtime/eager/eager_operation.h"
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+#include "tensorflow/core/common_runtime/eager/profiler.h"
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
@@ -100,6 +101,13 @@ struct TFE_Op {
   tensorflow::EagerOperation operation;
 };
 
+struct TFE_Profiler {
+  TFE_Profiler(TFE_Context* ctx)
+      : profiler(tensorflow::EagerProfiler::Create(&ctx->context)) {}
+
+  std::unique_ptr<tensorflow::EagerProfiler> profiler;
+};
+
 namespace tensorflow {
 // Set an AttrValue on the op. Doesn't handle the list types.
 void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op,
diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc
index 2a4eaecb6cf2740a522b1e849d1306ebde6c4577..673aed558ab2588d2dd1e463c836082d27ef0777 100644
--- a/tensorflow/c/kernels.cc
+++ b/tensorflow/c/kernels.cc
@@ -158,3 +158,12 @@ void TF_SetOutput(TF_OpKernelContext* ctx, int i, const TF_Tensor* tensor,
     cc_ctx->set_output(i, cc_tensor);
   }
 }
+
+TF_DataType TF_ExpectedOutputDataType(TF_OpKernelContext* ctx, int i) {
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
+  return static_cast<TF_DataType>(cc_ctx->expected_output_dtype(i));
+}
+
+int64_t TF_StepId(TF_OpKernelContext* ctx) {
+  return reinterpret_cast<::tensorflow::OpKernelContext*>(ctx)->step_id();
+}
diff --git a/tensorflow/c/kernels.h b/tensorflow/c/kernels.h
index cefc30bcdf89bdc14a4406299cc29f74153e77ac..721a4aca0be1a23a6764d476578ab4a26382570d 100644
--- a/tensorflow/c/kernels.h
+++ b/tensorflow/c/kernels.h
@@ -111,6 +111,14 @@ TF_CAPI_EXPORT extern void TF_SetOutput(TF_OpKernelContext* ctx, int i,
                                         const TF_Tensor* tensor,
                                         TF_Status* status);
 
+// Returns the expected output data type of the ith output. If i < 0 or
+// i >= TF_NumOutputs(ctx), the program aborts.
+TF_CAPI_EXPORT extern TF_DataType TF_ExpectedOutputDataType(
+    TF_OpKernelContext* ctx, int i);
+
+// Returns the step ID of the given context.
+TF_CAPI_EXPORT extern int64_t TF_StepId(TF_OpKernelContext* ctx);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/kernels_test.cc b/tensorflow/c/kernels_test.cc
index e659ee3c3d258a626ccf03a782ec031b5a703a48..fdeb04c84c99912a9a1f898a7e91391901a6af7d 100644
--- a/tensorflow/c/kernels_test.cc
+++ b/tensorflow/c/kernels_test.cc
@@ -41,6 +41,9 @@ static void* MyCreateFunc(TF_OpKernelConstruction* ctx) {
 static void MyComputeFunc(void* kernel, TF_OpKernelContext* ctx) {
   struct MyCustomKernel* s = static_cast<struct MyCustomKernel*>(kernel);
   s->compute_called = true;
+  if (ctx != nullptr) {
+    EXPECT_EQ(43, TF_StepId(ctx));
+  }
 }
 
 static void MyDeleteFunc(void* kernel) {
@@ -155,6 +158,8 @@ TEST(TestKernel, TestInputAndOutputCount) {
     TF_SetOutput(ctx, 24, input, s);
     EXPECT_EQ(TF_OUT_OF_RANGE, TF_GetCode(s));
 
+    EXPECT_EQ(TF_UINT8, TF_ExpectedOutputDataType(ctx, 0));
+
     TF_DeleteStatus(s);
     if (input != nullptr) {
       TF_DeleteTensor(input);
@@ -175,6 +180,7 @@ TEST(TestKernel, TestInputAndOutputCount) {
     OpKernelContext::Params p;
     DummyDevice dummy_device(nullptr, false);
     p.device = &dummy_device;
+    p.step_id = 43;
 
     Tensor t(tensorflow::uint8(123));
 
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index b9a87ba296abfc6b9d9aaeff3b3e26678e4e1b94..ba9fa93654f62b9636eafc53865c7b572307e695 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -634,6 +634,7 @@ tf_cc_test(
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:session_options",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index 8617beec004d0fe912155f054442c5b6249bb6b5..1f8ec09e19c01d0a8b2a3761135ed53dfb2ad3b0 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -25,6 +25,8 @@ limitations under the License.
 #include "tensorflow/compiler/jit/encapsulate_util.h"
 #include "tensorflow/compiler/jit/extract_outside_compilation_pass.h"
 #include "tensorflow/compiler/tf2xla/side_effect_util.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -32,6 +34,8 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
 #include "tensorflow/core/util/equal_graph_def.h"
 
 namespace tensorflow {
@@ -513,6 +517,18 @@ Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library,
   s = PerformStaticShapeInferenceBeforeEncapsulation(graph.get());
   if (!s.ok()) return s;
 
+  // Create FunctionLibraryRuntime.
+  SessionOptions session_options;
+  std::vector<std::unique_ptr<Device>> devices;
+  TF_CHECK_OK(DeviceFactory::AddDevices(
+      session_options, "/job:localhost/replica:0/task:0", &devices));
+  OptimizerOptions opts;
+  auto device_mgr = absl::make_unique<DeviceMgr>(std::move(devices));
+  auto pflr = absl::make_unique<ProcessFunctionLibraryRuntime>(
+      device_mgr.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def.get(),
+      opts, /*default_thread_pool=*/nullptr, /*cluster_flr=*/nullptr);
+  auto flr = pflr->GetFLR("/job:localhost/replica:0/task:0/cpu:0");
+
   std::unique_ptr<Graph> graph_out;
   s = EncapsulateSubgraphsInFunctions(
       "_encapsulate", /*outside_compilation_attribute=*/"", *graph,
@@ -538,7 +554,7 @@ Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library,
                                     std::map<string, int>{}});
   }
   s = ExtractOutsideCompilation("_encapsulate", "_outside", clusters,
-                                graph_out.get(), lib_def.get());
+                                graph_out.get(), flr, lib_def.get());
   if (!s.ok()) return s;
 
   GraphDef graphdef_out;
@@ -941,7 +957,9 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const DataType>({})},
-            {"_outside_compilation_subgraph", "O1"}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"c"}},
       },
       {{"f_0_retval_retval", "F:o:0"}});
@@ -1101,7 +1119,9 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
             {"key", "host_compute_channel_F1_O2"},
             {"shape_inference_graph", shape_inference_graph2},
             {"shapes", absl::Span<const DataType>({})},
-            {"_outside_compilation_subgraph", "O2"}},
+            {"_outside_compilation_subgraph", "O2"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"F"}},
           {{"outside_compilation_O1_host_compute"},
            "XlaHostCompute",
@@ -1112,7 +1132,9 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", shape_inference_graph1},
             {"shapes", absl::Span<const DataType>({})},
-            {"_outside_compilation_subgraph", "O1"}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"D"}},
       },
       {{"g_0_retval_retval", "outside_compilation_O2_host_compute:outputs:0"},
@@ -1244,7 +1266,9 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
             {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
-            {"_outside_compilation_subgraph", "O1"}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"D"}},
       },
       {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
@@ -1269,7 +1293,9 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
             {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
       {{"g_0_retval_retval", "G:o:0"}, {"i_0_retval_retval", "I:o:0"}});
 
@@ -1397,7 +1423,9 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
             {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
-            {"_outside_compilation_subgraph", "O1"}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"D"}},
       },
       {{"f_0_retval_retval", "F:o:0"}});
@@ -1419,7 +1447,9 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
             {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
       {{"i_0_retval_retval", "I:o:0"}});
 
@@ -1527,7 +1557,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
             {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
       {{"f_0_retval_retval", "F:o:0"}});
 
@@ -1615,7 +1647,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
             {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
-            {"_outside_compilation_subgraph", "O1"}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"D"}},
       },
       {{"f_0_retval_retval", "F:o:0"}});
@@ -1716,7 +1750,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const TensorShapeProto>({})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
       {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
        {"f_0_retval_retval", "F:o:0"}});
@@ -1821,7 +1857,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const TensorShapeProto>({})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
       {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
        {"f_0_retval_retval", "F:o:0"}});
@@ -1949,7 +1987,9 @@ TEST(EncapsulateSubgraphsTest,
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", shape_inference_graph1},
             {"shapes", absl::Span<const TensorShapeProto>({})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
           {{"outside_compilation_O2_host_compute"},
            "XlaHostCompute",
            {"F:o:0"},
@@ -1959,7 +1999,9 @@ TEST(EncapsulateSubgraphsTest,
             {"key", "host_compute_channel_F1_O2"},
             {"shape_inference_graph", shape_inference_graph2},
             {"shapes", absl::Span<const TensorShapeProto>({})},
-            {"_outside_compilation_subgraph", "O2"}}},
+            {"_outside_compilation_subgraph", "O2"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
       {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
        {"h_0_retval_retval", "H:o:0"}});
@@ -2082,7 +2124,9 @@ TEST(EncapsulateSubgraphsTest,
             {"key", "host_compute_channel_F1_O2"},
             {"shape_inference_graph", NameAttrList()},
             {"shapes", absl::Span<const TensorShapeProto>({})},
-            {"_outside_compilation_subgraph", "O2"}}},
+            {"_outside_compilation_subgraph", "O2"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
           {{"outside_compilation_O1_host_compute"},
            "XlaHostCompute",
            {"D:o:0"},
@@ -2092,7 +2136,9 @@ TEST(EncapsulateSubgraphsTest,
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const TensorShapeProto>({})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
       {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
        {"h_0_retval_retval", "H:o:0"}});
@@ -2214,7 +2260,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
          {"key", "host_compute_channel_F1_O1"},
          {"shape_inference_graph", shape_inference_graph},
          {"shapes", absl::Span<const TensorShapeProto>({})},
-         {"_outside_compilation_subgraph", "O1"}}},
+         {"_outside_compilation_subgraph", "O1"},
+         {"_xla_token_input_nodes",
+          absl::Span<const string>({"_xla_token_arg_node"})}}},
        {{"outside_compilation_O2_host_compute"},
         "XlaHostCompute",
         {"D:o:0"},
@@ -2224,7 +2272,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
          {"key", "host_compute_channel_F1_O2"},
          {"shape_inference_graph", NameAttrList()},
          {"shapes", absl::Span<const TensorShapeProto>({})},
-         {"_outside_compilation_subgraph", "O2"}},
+         {"_outside_compilation_subgraph", "O2"},
+         {"_xla_token_input_nodes",
+          absl::Span<const string>({"_xla_token_arg_node"})}},
         {}},
        {{"outside_compilation_O3_host_compute"},
         "XlaHostCompute",
@@ -2235,7 +2285,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
          {"key", "host_compute_channel_F1_O3"},
          {"shape_inference_graph", NameAttrList()},
          {"shapes", absl::Span<const TensorShapeProto>({})},
-         {"_outside_compilation_subgraph", "O3"}},
+         {"_outside_compilation_subgraph", "O3"},
+         {"_xla_token_input_nodes",
+          absl::Span<const string>({"_xla_token_arg_node"})}},
         {}}},
       {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
        {"h_0_retval_retval", "H:o:0"}});
@@ -2354,7 +2406,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const TensorShapeProto>({})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
       {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
        {"f_0_retval_retval", "F:o:0"}});
@@ -2465,7 +2519,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const DataType>({})},
-            {"_outside_compilation_subgraph", "O1"}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"c"}},
       },
       {{"f_0_retval_retval", "F:o:0"}});
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
index 8b01768c49422b331b52a8ba31bade000c95722e..2a770c527b2fae91352fd17dacb13495a3a73f34 100644
--- a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 
 namespace tensorflow {
 
@@ -308,6 +309,10 @@ xla::StatusOr<NodeDef> BuildXlaHostComputeNodeDef(
     host_compute_builder.Attr("tpu_core", core);
   }
 
+  // Set input tokens.
+  host_compute_builder.Attr(kXlaTokenInputNodesAttrName,
+                            std::vector<string>{kXlaTokenArgNodeName});
+
   // Populate inputs.
   std::vector<DataType> input_dtypes;
   TF_RETURN_IF_ERROR(GetNodeAttr(call_node->attrs(), "Tinputs", &input_dtypes));
@@ -398,8 +403,8 @@ Status ReplaceOrRemoveOutsideCompilationCallNode(
 }
 
 // Resets "device_ordinal" attr to placeholder value for related nodes
-// (XlaRecvAtHost nodes; XlaSendFromHost nodes; If nodes containing
-// XlaRecvAtHost/XlaSendFromHost).
+// (XlaRecvAtHost nodes; XlaSendFromHost nodes; If/While/FuncCall nodes
+// containing XlaRecvAtHost/XlaSendFromHost).
 Status ResetDeviceOrdinalToPlaceholderValue(Graph* g) {
   AttrValue device_ordinal_value;
   device_ordinal_value.set_placeholder("device_ordinal");
@@ -429,6 +434,10 @@ Status ResetDeviceOrdinalToPlaceholderValue(Graph* g) {
         n->ClearAttr(attr_name);
         n->AddAttr(attr_name, branch_func);
       }
+    } else if (HasNodeAttr(n->def(), "device_ordinal")) {
+      // Function call node containing outside compilation.
+      n->ClearAttr("device_ordinal");
+      n->AddAttr("device_ordinal", device_ordinal_value);
     } else {
       return errors::Internal("Unknown node marked with ",
                               kXlaHasHostTransferAttrName, ": ",
@@ -1217,20 +1226,129 @@ Status BuildHostGraphForWhileNode(
   return Status::OK();
 }
 
+// Builds host graph for func call nodes.
+Status BuildHostGraphForFuncCallNode(const string& func_call_node_name,
+                                     const string& xla_cluster_name,
+                                     const string& func_call_host_func_name,
+                                     const string& host_graph_func_name,
+                                     FunctionLibraryDefinition* fld) {
+  Graph host_graph(fld);
+  AttrValue device_ordinal_value;
+  device_ordinal_value.set_placeholder("device_ordinal");
+
+  // Step 1: add key placeholder node.
+  TF_ASSIGN_OR_RETURN(
+      Node * key_placeholder,
+      AddHostComputeKeyPlaceholder(xla_cluster_name, &host_graph));
+
+  // Step 2: rewrite `host_func_name`, replace key placeholder with an _Arg
+  // node.
+  TF_RETURN_IF_ERROR(ReplaceKeyPlaceholderWithArgNode(
+      xla_cluster_name, func_call_host_func_name, fld));
+
+  // Step 3: build a function call node with `host_func_name`, with
+  // `key_placeholder` as input.
+  NodeDefBuilder call_builder(absl::StrCat("oc_call_", func_call_node_name),
+                              func_call_host_func_name, fld);
+  call_builder.Input(key_placeholder->name(), 0, DT_STRING);
+  call_builder.Attr("device_ordinal", device_ordinal_value);
+  call_builder.Attr(kXlaHasHostTransferAttrName, true);
+  NodeDef call_def;
+  TF_RETURN_IF_ERROR(call_builder.Finalize(&call_def));
+  Status s;
+  Node* call_node = host_graph.AddNode(call_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  host_graph.AddEdge(key_placeholder, 0, call_node, 0);
+
+  // Convert `host_graph` to function, and add a "device_ordinal" attr.
+  FunctionDef oc_host_graph_fdef;
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(host_graph, host_graph_func_name,
+                                        &oc_host_graph_fdef));
+  if (fld->Find(host_graph_func_name)) {
+    TF_RETURN_IF_ERROR(
+        fld->ReplaceFunction(host_graph_func_name, oc_host_graph_fdef));
+  } else {
+    TF_RETURN_IF_ERROR(fld->AddFunctionDef(oc_host_graph_fdef));
+  }
+
+  return Status::OK();
+}
+
 Status ExtractOutsideCompilationForNodesWithAssociatedFunctions(
     Graph* g, const string& xla_cluster_attr_name,
     const string& outside_compilation_attr_name, const string& xla_cluster_name,
-    const std::map<string, int>& host_compute_core,
+    const std::map<string, int>& host_compute_core, FunctionLibraryRuntime* flr,
     FunctionLibraryDefinition* fld, std::vector<string>* host_graphs,
     std::vector<string>* shape_inference_graphs,
     bool* has_outside_compilation) {
-  std::vector<Node*> if_nodes, while_nodes;
+  std::vector<Node*> if_nodes, while_nodes, func_call_nodes;
   for (Node* n : g->nodes()) {
     if (n->type_string() == "If") {
       if_nodes.push_back(n);
     } else if (n->type_string() == "While") {
       while_nodes.push_back(n);
+    } else if (fld->Contains(n->type_string())) {
+      func_call_nodes.push_back(n);
+    } else if (n->type_string() == FunctionLibraryDefinition::kGradientOp) {
+      // Only gradient for user-defined function should be considered as
+      // function call node.
+      NameAttrList original_func;
+      TF_RETURN_IF_ERROR(GetNodeAttr(
+          n->def(), FunctionLibraryDefinition::kFuncAttr, &original_func));
+      if (fld->Contains(original_func.name())) {
+        func_call_nodes.push_back(n);
+      }
+    }
+  }
+
+  for (Node* n : func_call_nodes) {
+    // Extract outside compilation for the function call.
+    bool func_has_outside_compilation = false;
+    NameAttrList func;
+    func.set_name(n->type_string());
+    typedef protobuf::Map<string, AttrValue> AttrMap;
+    *func.mutable_attr() = AttrMap(n->attrs().begin(), n->attrs().end());
+    string new_func_name = absl::StrCat(n->name(), "_oc");
+    string host_func_name = absl::StrCat("oc_func_call_host_", n->name());
+    TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
+        xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
+        func, new_func_name, host_func_name, host_compute_core, flr, fld,
+        shape_inference_graphs, &func_has_outside_compilation));
+
+    // If the function call does not have outside compilation, nothing to do.
+    if (!func_has_outside_compilation) {
+      continue;
     }
+
+    *has_outside_compilation = true;
+
+    // Change `n` to call the new function directly.
+    NodeDefBuilder replace_builder(n->name(), new_func_name, fld);
+    for (const Edge* e : n->in_edges()) {
+      if (e->IsControlEdge()) {
+        continue;
+      }
+      replace_builder.Input(e->src()->name(), e->src_output(),
+                            e->src()->output_type(e->src_output()));
+    }
+    for (const auto& attr : n->attrs()) {
+      replace_builder.Attr(attr.first, attr.second);
+    }
+    NodeDef replace_def;
+    TF_RETURN_IF_ERROR(replace_builder.Finalize(&replace_def));
+    TF_ASSIGN_OR_RETURN(Node * replace, ReplaceNode(g, n, replace_def));
+    replace->AddAttr(kXlaTokenInputNodesAttrName,
+                     std::vector<string>{kXlaTokenArgNodeName});
+
+    // Build host side graph for the function call.
+    string oc_host_graph_name =
+        absl::StrCat("oc_func_host_graph_", replace->name());
+    TF_RETURN_IF_ERROR(
+        BuildHostGraphForFuncCallNode(replace->name(), xla_cluster_name,
+                                      host_func_name, oc_host_graph_name, fld));
+
+    // Record the host graph.
+    host_graphs->push_back(oc_host_graph_name);
   }
 
   for (Node* n : if_nodes) {
@@ -1251,12 +1369,12 @@ Status ExtractOutsideCompilationForNodesWithAssociatedFunctions(
     TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
         xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
         then_branch, then_branch_xla_func_name, then_branch_host_func_name,
-        host_compute_core, fld, shape_inference_graphs,
+        host_compute_core, flr, fld, shape_inference_graphs,
         &then_branch_has_outside_compilation));
     TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
         xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
         else_branch, else_branch_xla_func_name, else_branch_host_func_name,
-        host_compute_core, fld, shape_inference_graphs,
+        host_compute_core, flr, fld, shape_inference_graphs,
         &else_branch_has_outside_compilation));
 
     // If then/else branch do not have outside compilation, nothing to do.
@@ -1316,12 +1434,12 @@ Status ExtractOutsideCompilationForNodesWithAssociatedFunctions(
            body_xla_func_name = absl::StrCat(body.name(), "_oc");
     TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
         xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
-        cond, cond_xla_func_name, cond_host_func_name, host_compute_core, fld,
-        shape_inference_graphs, &cond_has_outside_compilation));
+        cond, cond_xla_func_name, cond_host_func_name, host_compute_core, flr,
+        fld, shape_inference_graphs, &cond_has_outside_compilation));
     TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
         xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
-        body, body_xla_func_name, body_host_func_name, host_compute_core, fld,
-        shape_inference_graphs, &body_has_outside_compilation));
+        body, body_xla_func_name, body_host_func_name, host_compute_core, flr,
+        fld, shape_inference_graphs, &body_has_outside_compilation));
 
     // If cond/body do not have outside compilation, nothing to do.
     if (!cond_has_outside_compilation && !body_has_outside_compilation) {
@@ -1469,17 +1587,27 @@ Status ExtractOutsideCompilationForFunction(
     const string& outside_compilation_attr_name, const string& xla_cluster_name,
     const NameAttrList& func_name_attrs, const string& new_func_name,
     const string& host_graph_func_name,
-    const std::map<string, int>& host_compute_core,
+    const std::map<string, int>& host_compute_core, FunctionLibraryRuntime* flr,
     FunctionLibraryDefinition* fld, std::vector<string>* shape_inference_graphs,
     bool* has_outside_compilation) {
+  // Convert the function to graph.
   const string& func_name = func_name_attrs.name();
-  const FunctionDef* fdef = fld->Find(func_name);
-  if (!fdef) {
-    return errors::Internal("Cannot find function ", func_name);
-  }
+  FunctionLibraryRuntime::Handle handle;
+  TF_RETURN_IF_ERROR(
+      flr->Instantiate(func_name, AttrSlice(&func_name_attrs.attr()), &handle));
+  Status ret_status = Status::OK();
+  auto cleanup_handle = gtl::MakeCleanup([&]() {
+    auto s = flr->ReleaseHandle(handle);
+    if (!s.ok()) {
+      ret_status.Update(s);
+    }
+  });
+  const FunctionBody* fbody = flr->GetFunctionBody(handle);
+
+  // Check if we have outside compilation nodes.
   *has_outside_compilation = false;
-  for (auto& node_def : fdef->node_def()) {
-    if (HasNodeAttr(node_def, outside_compilation_attr_name)) {
+  for (Node* n : fbody->graph->nodes()) {
+    if (HasNodeAttr(n->def(), outside_compilation_attr_name)) {
       *has_outside_compilation = true;
       break;
     }
@@ -1487,16 +1615,6 @@ Status ExtractOutsideCompilationForFunction(
   // We cannot early return here, because we might have outside compilation in
   // If/While function body.
 
-  // Convert the function to graph.
-  FunctionBody* fbody = nullptr;
-  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
-      *fld->Find(func_name), AttrSlice(&func_name_attrs.attr()), fld,
-      [&](const string& op, const OpDef** sig) {
-        return fld->LookUpOpDef(op, sig);
-      },
-      &fbody));
-  std::unique_ptr<FunctionBody> fbody_deleter(fbody);
-
   // Preprocess edges between different outside compilations. They will be
   // restored in `ConstructHostGraph()`.
   TF_RETURN_IF_ERROR(PreprocessEdgesBetweenOutsideCompilations(
@@ -1553,16 +1671,11 @@ Status ExtractOutsideCompilationForFunction(
     TF_RETURN_IF_ERROR(ReplaceOrRemoveOutsideCompilationCallNode(
         graph_out.get(), n, host_compute_core));
   }
-  if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile(
-        absl::StrCat("extract_outside_compilation_for_func_after_", func_name),
-        *graph_out, fld);
-  }
 
   // Handle nodes with associated functions.
   TF_RETURN_IF_ERROR(ExtractOutsideCompilationForNodesWithAssociatedFunctions(
       graph_out.get(), xla_cluster_attr_name, outside_compilation_attr_name,
-      xla_cluster_name, host_compute_core, fld,
+      xla_cluster_name, host_compute_core, flr, fld,
       &outside_compilation_host_graphs, shape_inference_graphs,
       has_outside_compilation));
 
@@ -1580,20 +1693,31 @@ Status ExtractOutsideCompilationForFunction(
   FunctionDef updated_fdef;
   TF_RETURN_IF_ERROR(
       GraphToFunctionDef(*graph_out, new_func_name, &updated_fdef));
+  const FunctionDef* original_fdef = fld->Find(func_name);
+  if (original_fdef) {
+    for (const auto& attr : original_fdef->attr()) {
+      (*updated_fdef.mutable_attr())[attr.first] = attr.second;
+    }
+  }
   if (fld->Find(new_func_name)) {
     TF_RETURN_IF_ERROR(fld->ReplaceFunction(new_func_name, updated_fdef));
   } else {
     TF_RETURN_IF_ERROR(fld->AddFunctionDef(updated_fdef));
   }
+  if (VLOG_IS_ON(4)) {
+    dump_graph::DumpGraphToFile(
+        absl::StrCat("extract_outside_compilation_for_func_after_", func_name),
+        *graph_out, fld);
+  }
 
-  return Status::OK();
+  return ret_status;
 }
 
 Status ExtractOutsideCompilation(
     const string& xla_cluster_attr_name,
     const string& outside_compilation_attr_name,
     const std::unordered_map<string, XlaClusterInfo>& clusters, Graph* g,
-    FunctionLibraryDefinition* fld) {
+    FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld) {
   if (VLOG_IS_ON(4)) {
     dump_graph::DumpGraphToFile("extract_outside_compilation_before", *g, fld);
   }
@@ -1610,7 +1734,7 @@ Status ExtractOutsideCompilation(
     TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
         xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
         func_name_attrs, func_name_attrs.name(), host_graph_func_name,
-        host_compute_core, fld, &shape_inference_graphs,
+        host_compute_core, flr, fld, &shape_inference_graphs,
         &has_outside_compilation));
     TF_RETURN_IF_ERROR(
         ExpandHostGraphIntoMainGraph(g, fld, host_graph_func_name, n));
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.h b/tensorflow/compiler/jit/extract_outside_compilation_pass.h
index e07e7c5dd0cd42ddd4d643d8b36583c82056bbb2..d64cc2a103ed040cbf413ac736f97f84459e869b 100644
--- a/tensorflow/compiler/jit/extract_outside_compilation_pass.h
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.h
@@ -89,7 +89,7 @@ Status ExtractOutsideCompilationForFunction(
     const string& outside_compilation_attr_name, const string& xla_cluster_name,
     const NameAttrList& func_name_attrs, const string& new_func_name,
     const string& host_graph_func_name,
-    const std::map<string, int>& host_compute_core,
+    const std::map<string, int>& host_compute_core, FunctionLibraryRuntime* flr,
     FunctionLibraryDefinition* fld, std::vector<string>* shape_inference_graphs,
     bool* has_outside_compilation);
 
@@ -101,7 +101,7 @@ Status ExtractOutsideCompilation(
     const string& xla_cluster_attr_name,
     const string& outside_compilation_attr_name,
     const std::unordered_map<string, XlaClusterInfo>& clusters, Graph* g,
-    FunctionLibraryDefinition* fld);
+    FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
index e9a89e34e0c7b04b4be34e367b2d0bf627c0061a..7c3a24feff81b21a5d2347d21fb80988bc3e6065 100644
--- a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/compiler/jit/encapsulate_util.h"
 #include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/function.h"
@@ -31,6 +32,8 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
 
 namespace tensorflow {
 
@@ -222,7 +225,42 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, ShapesInferred) {
   EXPECT_EQ(shapes[0].dim_size(), 1);
 }
 
-TEST(ExtractOutsideCompilationForFunctionTest, Basic) {
+class ExtractOutsideCompilationForFunctionTest : public ::testing::Test {
+ public:
+  void SetUp() override {
+    SessionOptions session_options;
+    std::vector<std::unique_ptr<Device>> devices;
+    TF_CHECK_OK(DeviceFactory::AddDevices(
+        session_options, "/job:localhost/replica:0/task:0", &devices));
+    device_mgr_ = absl::make_unique<DeviceMgr>(std::move(devices));
+  }
+
+  Status ExtractOutsideCompilationTest(
+      const string &xla_cluster_attr_name,
+      const string &outside_compilation_attr_name,
+      const string &xla_cluster_name, const NameAttrList &func_name_attrs,
+      const string &new_func_name, const string &host_graph_func_name,
+      const std::map<string, int> &host_compute_core,
+      FunctionLibraryDefinition *fld,
+      std::vector<string> *shape_inference_graphs,
+      bool *has_outside_compilation) {
+    OptimizerOptions opts;
+    pflr_ = absl::make_unique<ProcessFunctionLibraryRuntime>(
+        device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, fld, opts,
+        /*default_thread_pool=*/nullptr, /*cluster_flr=*/nullptr);
+    auto flr = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:0");
+    return ExtractOutsideCompilationForFunction(
+        xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
+        func_name_attrs, new_func_name, host_graph_func_name, host_compute_core,
+        flr, fld, shape_inference_graphs, has_outside_compilation);
+  }
+
+ private:
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+};
+
+TEST_F(ExtractOutsideCompilationForFunctionTest, Basic) {
   // Build the XLA computation func.
   // "const0"
   // "identity0" = "const0" (outside compilation cluster "0")
@@ -256,7 +294,7 @@ TEST(ExtractOutsideCompilationForFunctionTest, Basic) {
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
-  TF_CHECK_OK(ExtractOutsideCompilationForFunction(
+  TF_CHECK_OK(ExtractOutsideCompilationTest(
       "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
       host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
@@ -362,7 +400,7 @@ TEST(ExtractOutsideCompilationForFunctionTest, Basic) {
   }
 }
 
-TEST(ExtractOutsideCompilationForFunctionTest, NoHostGraph) {
+TEST_F(ExtractOutsideCompilationForFunctionTest, NoHostGraph) {
   // Build the XLA computation func.
   // "const0"
   FunctionDefLibrary fdl;
@@ -384,7 +422,7 @@ TEST(ExtractOutsideCompilationForFunctionTest, NoHostGraph) {
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
-  TF_CHECK_OK(ExtractOutsideCompilationForFunction(
+  TF_CHECK_OK(ExtractOutsideCompilationTest(
       "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
       host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
@@ -406,7 +444,7 @@ TEST(ExtractOutsideCompilationForFunctionTest, NoHostGraph) {
   EXPECT_EQ(host_graph->num_nodes(), 2);
 }
 
-TEST(ExtractOutsideCompilationForFunctionTest, XlaHostComputeRemoved) {
+TEST_F(ExtractOutsideCompilationForFunctionTest, XlaHostComputeRemoved) {
   // Build the XLA computation func.
   // "const0"
   // "const1" (outside compilation cluster "0")
@@ -432,7 +470,7 @@ TEST(ExtractOutsideCompilationForFunctionTest, XlaHostComputeRemoved) {
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
-  TF_CHECK_OK(ExtractOutsideCompilationForFunction(
+  TF_CHECK_OK(ExtractOutsideCompilationTest(
       "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
       host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
@@ -489,7 +527,7 @@ REGISTER_OP("XlaRecvFromHost")
     .Attr("key: string")
     .SetIsStateful();
 
-TEST(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
+TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
   // Build the XLA computation func.
   // "const0" (bool)
   // "const1" (int32)
@@ -555,7 +593,7 @@ TEST(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
-  TF_CHECK_OK(ExtractOutsideCompilationForFunction(
+  TF_CHECK_OK(ExtractOutsideCompilationTest(
       "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
       host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
@@ -651,7 +689,7 @@ TEST(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
   }
 }
 
-TEST(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) {
+TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) {
   // Build the XLA computation func.
   // "const0" (bool)
   // "while0" (input = "const0", cond = "cond_fn", body = "body_fn")
@@ -714,7 +752,7 @@ TEST(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) {
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
-  TF_CHECK_OK(ExtractOutsideCompilationForFunction(
+  TF_CHECK_OK(ExtractOutsideCompilationTest(
       "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
       host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
@@ -782,4 +820,162 @@ TEST(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) {
   }
 }
 
+TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) {
+  // Build the XLA computation func.
+  // "const0" (int32)
+  // "fn" (input = "const0")
+  FunctionDefLibrary fdl;
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output arg = ops::_Arg(s.WithOpName("arg"), DT_INT32, 0);
+    Output identity = ops::Identity(s.WithOpName("identity"), arg);
+    ops::_Retval retval(s.WithOpName("retval"), identity, 0);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+    auto node_name_image = g->BuildNodeNameIndex();
+    node_name_image["identity"]->AddAttr("_oc", "0");
+    PartialTensorShape shape({2});
+    node_name_image["identity"]->AddAttr(
+        kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape});
+
+    FunctionDef *true_fn_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "fn", true_fn_fdef));
+  }
+  FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
+  {
+    std::unique_ptr<Graph> g(new Graph(&fld));
+
+    tensorflow::TensorProto tensor_proto;
+    tensor_proto.set_dtype(tensorflow::DT_INT32);
+    tensorflow::TensorShapeProto shape;
+    shape.add_dim()->set_size(2);
+    *tensor_proto.mutable_tensor_shape() = shape;
+    for (int i = 0; i < 2; ++i) {
+      tensor_proto.add_int_val(1);
+    }
+    NodeDef const_def;
+    TF_CHECK_OK(NodeDefBuilder("const", "Const")
+                    .Attr("dtype", DT_INT32)
+                    .Attr("value", tensor_proto)
+                    .Finalize(&const_def));
+    Status s;
+    Node *const_node = g->AddNode(const_def, &s);
+    TF_CHECK_OK(s);
+
+    NodeDef fn_def;
+    TF_CHECK_OK(NodeDefBuilder("fn", "fn", &fld)
+                    .Input("const", 0, DT_INT32)
+                    .Finalize(&fn_def));
+    Node *fn_node = g->AddNode(fn_def, &s);
+    TF_CHECK_OK(s);
+    g->AddEdge(const_node, 0, fn_node, 0);
+
+    NodeDef ret_def;
+    TF_CHECK_OK(NodeDefBuilder("ret", "_Retval")
+                    .Attr("index", 0)
+                    .Attr("T", DT_INT32)
+                    .Input("fn", 0, DT_INT32)
+                    .Finalize(&ret_def));
+    Node *ret_node = g->AddNode(ret_def, &s);
+    TF_CHECK_OK(s);
+    g->AddEdge(fn_node, 0, ret_node, 0);
+
+    FunctionDef *xla_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
+    TF_CHECK_OK(fld.AddFunctionDef(*xla_fdef));
+  }
+
+  protobuf::Map<string, tensorflow::AttrValue> attrs;
+  std::map<string, int> host_compute_core;
+  std::vector<string> shape_inference_graphs;
+  bool has_outside_compilation;
+  NameAttrList name_attrs;
+  name_attrs.set_name("cluster");
+  *name_attrs.mutable_attr() = attrs;
+  TF_CHECK_OK(ExtractOutsideCompilationTest(
+      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
+      host_compute_core, &fld, &shape_inference_graphs,
+      &has_outside_compilation));
+
+  // Check host graph.
+  {
+    FunctionBody *host_fbody = nullptr;
+    AttrValue device_ordinal_temp_value;
+    device_ordinal_temp_value.set_i(0);
+    protobuf::Map<string, AttrValue> host_func_attrs;
+    host_func_attrs["device_ordinal"] = device_ordinal_temp_value;
+    TF_CHECK_OK(FunctionDefToBodyHelper(
+        *fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld,
+        [&](const string &op, const OpDef **sig) {
+          return fld.LookUpOpDef(op, sig);
+        },
+        &host_fbody));
+    std::unique_ptr<FunctionBody> host_fbody_deleter(host_fbody);
+    Graph *host_graph = host_fbody->graph;
+    auto node_name_index = host_graph->BuildNodeNameIndex();
+
+    // Verify we have call node for outside compilation in `fn`.
+    Node *call_node = node_name_index["oc_call_fn"];
+    EXPECT_NE(call_node, nullptr);
+
+    FunctionBody *call_fbody = nullptr;
+    TF_CHECK_OK(FunctionDefToBodyHelper(
+        *fld.Find("oc_func_call_host_fn"), AttrSlice(&host_func_attrs), &fld,
+        [&](const string &op, const OpDef **sig) {
+          return fld.LookUpOpDef(op, sig);
+        },
+        &call_fbody));
+    std::unique_ptr<FunctionBody> call_fbody_deleter(call_fbody);
+
+    // Verify we have _XlaRecvAtHost and _XlaSendFromHost nodes.
+    bool has_recv = false, has_send = false;
+    for (Node *n : call_fbody->graph->nodes()) {
+      if (n->type_string() == "_XlaRecvAtHost") {
+        has_recv = true;
+      } else if (n->type_string() == "_XlaSendFromHost") {
+        has_send = true;
+      }
+    }
+    EXPECT_TRUE(has_recv);
+    EXPECT_TRUE(has_send);
+  }
+
+  // Check XLA graph.
+  {
+    FunctionBody *xla_fbody = nullptr;
+    TF_CHECK_OK(FunctionDefToBodyHelper(
+        *fld.Find("cluster_rewritten"), AttrSlice(), &fld,
+        [&](const string &op, const OpDef **sig) {
+          return fld.LookUpOpDef(op, sig);
+        },
+        &xla_fbody));
+    std::unique_ptr<FunctionBody> xla_fbody_deleter(xla_fbody);
+    Graph *xla_graph = xla_fbody->graph;
+    auto node_name_index = xla_graph->BuildNodeNameIndex();
+
+    // Check that we have call node.
+    Node *fn_node = node_name_index["fn"];
+    EXPECT_NE(fn_node, nullptr);
+    EXPECT_EQ(fn_node->type_string(), "fn_oc");
+
+    FunctionBody *call_fbody = nullptr;
+    TF_CHECK_OK(FunctionDefToBodyHelper(
+        *fld.Find("fn_oc"), AttrSlice(), &fld,
+        [&](const string &op, const OpDef **sig) {
+          return fld.LookUpOpDef(op, sig);
+        },
+        &call_fbody));
+    std::unique_ptr<FunctionBody> call_fbody_deleter(call_fbody);
+
+    // Verify we have XlaHostCompute nodes.
+    bool has_hc = false;
+    for (Node *n : call_fbody->graph->nodes()) {
+      if (n->type_string() == "XlaHostCompute") {
+        has_hc = true;
+      }
+    }
+    EXPECT_TRUE(has_hc);
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 6618e3a58ab7b6374ed775cd6e4e18a6a4975588..50afec020debe3aa87c317d658891d4e58762862 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -677,6 +677,11 @@ Status MarkForCompilationPass::Run(
   VLOG(1) << "flags->tf_xla_auto_jit = " << flags->tf_xla_auto_jit;
   const FunctionLibraryDefinition* fld = options.flib_def;
 
+  // Deadness analysis expects a graph with source and sink edges properly
+  // connected but sometimes the incoming graph does not follow this invariant.
+  // So fix up the source and sink edges before calling into deadness analysis.
+  FixupSourceAndSinkEdges(options.graph->get());
+
   std::unique_ptr<DeadnessAnalysis> deadness;
   {
     XLA_SCOPED_LOGGING_TIMER_LEVEL("DeadnessAnalysis", 1);
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index 3df5479a55e841380ca7b8cdd0add9fd17487091..bff4cc57ee1f3ac0fc12aaa93b1588553aec8c45 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -38,6 +38,8 @@ limitations under the License.
 
 namespace tensorflow {
 
+constexpr int64 XlaCompilationCache::kDefaultCompilationThreshold;
+
 XlaCompilationCache::XlaCompilationCache(xla::LocalClient* client,
                                          DeviceType device_type)
     : client_(client), device_type_(std::move(device_type)) {}
@@ -60,7 +62,7 @@ XlaCompilationCache::~XlaCompilationCache() {
   // about?
 }
 
-string XlaCompilationCache::DebugString() {
+string XlaCompilationCache::DebugString() const {
   return "XLA JIT compilation cache";
 }
 
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h
index 846d0c963dbfdf55f51120f2f138d12f5f63839b..02aa8f8839e2c033e06d043b0f17d89a08d5d9e6 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.h
+++ b/tensorflow/compiler/jit/xla_compilation_cache.h
@@ -88,7 +88,7 @@ class XlaCompilationCache : public ResourceBase {
   xla::LocalClient* client() const { return client_; }
   const DeviceType& device_type() const { return device_type_; }
 
-  string DebugString() override;
+  string DebugString() const override;
 
   // Describes the types, shapes and any compile-time constant arguments
   // to a kernel. Key that uniquely identifies a compilation output.
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index fa02cf9cbef45188a6dc2f861ff036649ea92b03..f80cb1812f00d36ddb7c28ae0e77c58498058ef3 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -230,6 +230,7 @@ tf_xla_py_test(
         "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:standard_ops",
     ],
 )
 
@@ -677,6 +678,7 @@ tf_xla_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:standard_ops",
     ],
 )
 
@@ -826,6 +828,7 @@ tf_xla_py_test(
         ":xla_test",
         "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python:standard_ops",
         "//tensorflow/python:stateless_random_ops",
     ],
 )
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index d8123e956fac04912b4fed5bf75cc9cb55c5baf9..0366ec45fb75a21b98ebfc4bdaa903bfa908de7a 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -669,6 +669,7 @@ cc_library(
     name = "side_effect_util",
     srcs = ["side_effect_util.cc"],
     hdrs = ["side_effect_util.h"],
+    visibility = [":friends"],
     deps = [
         "//tensorflow/core:core_cpu",
         "@com_google_absl//absl/strings",
diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc
index efb75749722893100494e089c0beb96944e9f1d4..0c2bb0223905b22613a64ad54f07151f7f8590b2 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler.cc
+++ b/tensorflow/compiler/tf2xla/graph_compiler.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/literal_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/side_effect_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
@@ -191,6 +192,9 @@ Status GraphCompiler::CompileFunctionalNode(Node* n,
   // into the functions.
   XlaOpKernelContext xla_op_context(op_context);
 
+  XlaContext& context = XlaContext::Get(op_context);
+  auto* b = context.builder();
+
   XlaCompiler* compiler = xla_op_context.compiler();
 
   NameAttrList func;
@@ -219,8 +223,12 @@ Status GraphCompiler::CompileFunctionalNode(Node* n,
   TF_RETURN_IF_ERROR(
       PrepareArguments(&xla_op_context, graph.get(), expressions, &arguments));
 
+  bool add_token_input_output =
+      HasNodeAttr(n->def(), kXlaTokenInputNodesAttrName);
+
   XlaCompiler::CompileOptions compile_options;
   compile_options.is_entry_computation = false;
+  compile_options.add_token_input_output = add_token_input_output;
   XlaCompiler::CompilationResult result;
   TF_RETURN_IF_ERROR(
       compiler->CompileFunction(compile_options, func, arguments, &result));
@@ -234,9 +242,19 @@ Status GraphCompiler::CompileFunctionalNode(Node* n,
     }
     handles.push_back(expressions[i]->handle());
   }
-
-  XlaContext& context = XlaContext::Get(op_context);
-  auto* b = context.builder();
+  if (add_token_input_output) {
+    std::vector<string> token_input_nodes;
+    TF_RETURN_IF_ERROR(
+        GetNodeAttr(n->def(), kXlaTokenInputNodesAttrName, &token_input_nodes));
+    std::vector<xla::XlaOp> token_inputs;
+    for (const string& node_name : token_input_nodes) {
+      auto token_or = compiler->GetNodeToken(node_name);
+      TF_RETURN_IF_ERROR(token_or.status());
+      token_inputs.push_back(token_or.ConsumeValueOrDie());
+    }
+    xla::XlaOp token_input = xla::AfterAll(b, token_inputs);
+    handles.push_back(token_input);
+  }
 
   auto output_handle = xla::Call(b, *result.computation, handles);
   // The output handle of `Call` computation is a tuple type. Unzip it so
@@ -251,6 +269,10 @@ Status GraphCompiler::CompileFunctionalNode(Node* n,
       ++computation_output;
     }
   }
+  if (add_token_input_output) {
+    TF_RETURN_IF_ERROR(compiler->SetNodeToken(
+        n->name(), xla::GetTupleElement(output_handle, computation_output)));
+  }
   return b->first_error();
 }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
index 7199b9b6feb36dd45ef51f4c38463bc715fcc38a..c2b4c28d1566f5429c5d8109db94af0c3762b131 100644
--- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
@@ -99,8 +99,8 @@ class CategoricalOp : public XlaOpKernel {
     xla::PrimitiveType xla_output_type;
     OP_REQUIRES_OK(ctx,
                    DataTypeToPrimitiveType(output_type(0), &xla_output_type));
-    xla::XlaOp argmax = XlaHelpers::ArgMax(softmax_entries, xla_output_type,
-                                           /*axis=*/class_dimension);
+    xla::XlaOp argmax = xla::ArgMax(softmax_entries, xla_output_type,
+                                    /*axis=*/class_dimension);
     if (num_samples == 1) {
       argmax = xla::Reshape(argmax, {batch_size, 1});
     }
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
index b0bc7640307149459a29e6b0b2e8e8132e4141c9..5b4f863f7418ecda0db502ce25fed2d0042bf3ca 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
@@ -212,8 +212,8 @@ Status ConvBackpropComputeDimensionsV2XlaShapes(
       XLAShapeToTensorShape(out_backprop_shape, &out_backprop_tensor_shape));
   return ConvBackpropComputeDimensionsV2(
       label, num_spatial_dims, input_tensor_shape, filter_tensor_shape,
-      out_backprop_tensor_shape, dilations, strides, padding, data_format,
-      dims);
+      out_backprop_tensor_shape, dilations, strides, padding,
+      /*explicit_paddings=*/{}, data_format, dims);
 }
 
 }  // anonymous namespace
@@ -227,6 +227,11 @@ xla::StatusOr<ConvOpAttrs> ConvOpAttrs::Create(int num_spatial_dims,
   TF_RETURN_IF_ERROR(ctx->GetAttr("dilations", &attrs.dilations));
   TF_RETURN_IF_ERROR(ctx->GetAttr("strides", &attrs.strides));
   TF_RETURN_IF_ERROR(ctx->GetAttr("padding", &attrs.padding));
+  // TODO(reedwm): Support explicit padding.
+  if (attrs.padding == EXPLICIT) {
+    return errors::Unimplemented(
+        "XLA does not yet support Conv2D with explicit padding.");
+  }
 
   string data_format;
   TF_RETURN_IF_ERROR(ctx->GetAttr("data_format", &data_format));
@@ -428,11 +433,8 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
   int n_dim = GetTensorBatchDimIndex(num_dims, attrs.data_format);
   int c_dim = GetTensorFeatureDimIndex(num_dims, attrs.data_format);
 
-  // The conversion logic below assumes that the data format is NHWC, so we also
-  // check that here.
   bool use_batch_group_count =
-      filter_tensor_shape.dim_size(num_dims - 1) == 1 && attrs.depthwise &&
-      attrs.data_format == FORMAT_NHWC;
+      filter_tensor_shape.dim_size(num_dims - 1) == 1 && attrs.depthwise;
 
   std::vector<std::pair<int64, int64>> padding(attrs.num_spatial_dims);
   std::vector<int64> rhs_dilation(attrs.num_spatial_dims);
diff --git a/tensorflow/compiler/tf2xla/kernels/image_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
index 96ddd42e2ae04d454e4fb85628d139e17a543d2e..a31b5a2cfd75f16f416615e9dc6ecca8515d35c3 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "absl/types/span.h"
 #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h"
 #include "tensorflow/compiler/tf2xla/lib/util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
@@ -351,24 +352,26 @@ struct SuppressBodyFn {
     auto num_outputs_so_far = values[1];
     auto iou_mask = values[2];
     auto included_iou = values[3];
-    auto zero_r1 = xla::ConstantR1<int32>(builder, {0});
+    auto zero = xla::ConstantR0<int32>(builder, 0);
     // Determine if current elem is active using a slice.
-    auto row_idx_r1 = xla::Reshape(row_idx, {1});
-    auto active_elem = xla::DynamicSlice(included_iou, row_idx_r1, {1});
+    // TODO(b/118437727): The only reason we need an explicit vector is because
+    // some old GCCs can't deduce the right type for MakeConstSpan, and
+    // providing a single-value initializer list directly uses the wrong
+    // overload. Delete this once the deprecated overload is gone.
+    std::vector<xla::XlaOp> row_idx_vector = {row_idx};
+    auto active_elem = xla::DynamicSlice(included_iou, row_idx_vector, {1});
     active_elem = xla::Reshape(active_elem, {});
     // Increment output count iff current elem is not suppressed.
     num_outputs_so_far = xla::Select(
         active_elem, num_outputs_so_far + xla::ConstantR0<int32>(builder, 1),
         num_outputs_so_far);
     // Slice out the row_idx.
-    auto starts = xla::ConcatInDim(builder, {row_idx_r1, zero_r1}, 0);
-    auto row_iou = xla::DynamicSlice(iou_mask, starts, {1, num_boxes});
+    auto row_iou = xla::DynamicSlice(iou_mask, {row_idx, zero}, {1, num_boxes});
     // Remove the diagonal from consideration. An elem cannot suppress
     // itself.
-    auto update_starts = xla::ConcatInDim(builder, {zero_r1, row_idx_r1}, 0);
     row_iou = xla::DynamicUpdateSlice(
         row_iou, xla::ConstantR2FromArray2D<bool>(builder, {{false}}),
-        update_starts);
+        {zero, row_idx});
     // Create a suppression by inverting polarity.
     row_iou = xla::Reshape(row_iou, {num_boxes});
     auto supp_mask = xla::Not(row_iou);
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops.cc b/tensorflow/compiler/tf2xla/kernels/index_ops.cc
index 843b6bb4e658af16fd753c1a20b35dd3d18df027..978e9480eac5b522d1ee2d51a61841c6f1bbba0c 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/kernels/index_ops.h"
 
 #include "tensorflow/compiler/tf2xla/type_util.h"
-#include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
@@ -66,9 +65,9 @@ void XlaArgMinMaxOp::Compile(XlaOpKernelContext* ctx) {
   xla::XlaOp input = ctx->Input(0);
   xla::XlaOp output;
   if (is_min_) {
-    output = XlaHelpers::ArgMin(input, index_xla_type, axis);
+    output = xla::ArgMin(input, index_xla_type, axis);
   } else {
-    output = XlaHelpers::ArgMax(input, index_xla_type, axis);
+    output = xla::ArgMax(input, index_xla_type, axis);
   }
 
   ctx->SetOutput(0, output);
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
index 3e7e8eae6ed406003be2843305bb6b173845d6cc..30b993045c86c6d01f8eabe55986f132f8938643 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
@@ -16,9 +16,9 @@ limitations under the License.
 // Native XLA implementations of indexing ops.
 
 #include "tensorflow/compiler/tf2xla/type_util.h"
-#include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -74,7 +74,7 @@ class ArgMaxCustomCallOp : public XlaOpKernel {
     // shape isn't supported.
     if (!ctx->compiler()->options().allow_cpu_custom_calls ||
         (input_dims != 1 && input_dims != 2)) {
-      xla::XlaOp output = XlaHelpers::ArgMax(ctx->Input(0), output_type, axis);
+      xla::XlaOp output = xla::ArgMax(ctx->Input(0), output_type, axis);
       ctx->SetOutput(0, output);
       return;
     }
diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
index 02d71a394280a67cc264450021f91ba475aef7fc..d0c5231e843aefa68490e29475ee96bd92859aac 100644
--- a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
@@ -146,9 +146,9 @@ class StackPushOp : public XlaOpKernel {
     xla::XlaOp value = ctx->Input(1);
 
     // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
-    auto start_indices =
-        xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0<int32>(b, 0),
-                 xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}}));
+    std::vector<xla::XlaOp> start_indices(elem_shape.dims() + 1,
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = index;
 
     TensorShape slice_shape = elem_shape;
     slice_shape.InsertDim(0, 1LL);
@@ -202,9 +202,9 @@ class StackPopOp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, resource->SetValue(xla::Tuple(b, {ta, index})));
 
     // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
-    auto start_indices =
-        xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0<int32>(b, 0),
-                 xla::MakeEdgePaddingConfig({{0, stack_shape.dims() - 1}}));
+    std::vector<xla::XlaOp> start_indices(stack_shape.dims(),
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = index;
 
     auto slice_shape = stack_shape.dim_sizes();
     slice_shape[0] = 1LL;
diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
index 2c92a585f5679242d672d0402e617ff199b94f17..dfa09b16081e93ba843a1858e68e6ff756de20c1 100644
--- a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
@@ -291,5 +291,19 @@ class ResourceScatterNdAddOp : public ResourceScatterOp {
 };
 REGISTER_XLA_OP(Name("ResourceScatterNdAdd"), ResourceScatterNdAddOp);
 
+class ResourceScatterNdSubOp : public ResourceScatterOp {
+ public:
+  explicit ResourceScatterNdSubOp(OpKernelConstruction* context)
+      : ResourceScatterOp(context, /*indices_are_vectors=*/true,
+                          /*combiner=*/Combine) {}
+
+ private:
+  static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y,
+                            xla::XlaBuilder* builder) {
+    return xla::Sub(x, y);
+  }
+};
+REGISTER_XLA_OP(Name("ResourceScatterNdSub"), ResourceScatterNdSubOp);
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc
index b32683a682c3a6828e92572f444022dbfc84165f..941b04363f8386a7bdbe8c91ea34c9754592a52d 100644
--- a/tensorflow/compiler/tf2xla/kernels/while_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc
@@ -291,20 +291,15 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
 
   xla::XlaOp while_result = xla::While(cond_wrapper, *body.computation, init);
 
-  auto while_shape_or = builder->GetShape(while_result);
-  OP_REQUIRES_OK(ctx, while_shape_or.status());
-  auto count = xla::ShapeUtil::TupleElementCount(while_shape_or.ValueOrDie());
-  int max_index = body.outputs.size() + body.resource_updates.size() - 1;
-  OP_REQUIRES(
-      ctx, max_index < count,
-      errors::Internal("Max tuple element requested (", max_index,
-                       ") needs to be less than tuple size (", count, ")"));
-
-  // Sets non-variable outputs.
+  // Sets non-variable outputs and determine when resource variables start.
+  int resource_index = 0;
   for (int i = 0; i < ctx->num_outputs(); ++i) {
     if (ctx->input_type(i) != DT_RESOURCE) {
       ctx->SetOutput(body.input_mapping[i],
                      xla::GetTupleElement(while_result, i));
+      ++resource_index;
+    } else {
+      break;
     }
   }
   if (has_token_input_output_) {
@@ -326,7 +321,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
     XlaResource* resource;
     OP_REQUIRES_OK(ctx, ctx->GetResourceInput(update.input_index, &resource));
     if (update.modified) {
-      int pos = body.outputs.size() + i;
+      int pos = resource_index + i;
       OP_REQUIRES_OK(ctx,
                      resource->SetFromPack(
                          arguments[update.input_index].tensor_array_gradients,
diff --git a/tensorflow/compiler/tf2xla/ops/BUILD b/tensorflow/compiler/tf2xla/ops/BUILD
index 4dce0a2102cf9c782850ccc7af4f14b59bd51e53..7140b6a1227a53290c3747892a55886a7f48513b 100644
--- a/tensorflow/compiler/tf2xla/ops/BUILD
+++ b/tensorflow/compiler/tf2xla/ops/BUILD
@@ -4,7 +4,11 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_custom_op_library",
+    "tf_gen_op_wrapper_py",
+)
 
 cc_library(
     name = "xla_ops",
@@ -24,3 +28,14 @@ tf_gen_op_wrapper_py(
         ":xla_ops",
     ],
 )
+
+tf_custom_op_library(
+    name = "_xla_ops.so",
+    srcs = [
+        "xla_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+    ],
+)
diff --git a/tensorflow/compiler/tf2xla/python/BUILD b/tensorflow/compiler/tf2xla/python/BUILD
index fef97b98c376d9df8bbfd9cb6651216895e46bf4..9abdb04d7736e8ff5225688af4759a522d3e7fc7 100644
--- a/tensorflow/compiler/tf2xla/python/BUILD
+++ b/tensorflow/compiler/tf2xla/python/BUILD
@@ -15,6 +15,7 @@ load(
     "//tensorflow/core:platform/default/build_config.bzl",
     "tf_py_clif_cc",
 )
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
 tf_py_clif_cc(
     name = "xla_op_registry",
@@ -27,9 +28,13 @@ tf_py_clif_cc(
     ],
 )
 
-py_library(
+tf_custom_op_py_library(
     name = "xla",
     srcs = ["xla.py"],
+    dso = ["//tensorflow/compiler/tf2xla/ops:_xla_ops.so"],
+    kernels = [
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
+    ],
     deps = [
         "//tensorflow/compiler/tf2xla/ops:gen_xla_ops",
         "//tensorflow/compiler/xla:xla_data_proto_py",
diff --git a/tensorflow/compiler/tf2xla/resource_operation_table.cc b/tensorflow/compiler/tf2xla/resource_operation_table.cc
index ff9f1b9ccba2c4f3307890d5aac4ddb6cfaafcd9..c20d6a5fd1f3bd7dad30cb3359d13ed4609a2250 100644
--- a/tensorflow/compiler/tf2xla/resource_operation_table.cc
+++ b/tensorflow/compiler/tf2xla/resource_operation_table.cc
@@ -77,6 +77,7 @@ CreateResourceOpInfoMap() {
   add("ResourceScatterMin"                   , kReadWrite, kVariable);
   add("ResourceScatterMul"                   , kReadWrite, kVariable);
   add("ResourceScatterNdAdd"                 , kReadWrite, kVariable);
+  add("ResourceScatterNdSub"                 , kReadWrite, kVariable);
   add("ResourceScatterNdUpdate"              , kReadWrite, kVariable);
   add("ResourceScatterSub"                   , kReadWrite, kVariable);
   add("ResourceScatterUpdate"                , kReadWrite, kVariable);
diff --git a/tensorflow/compiler/tf2xla/side_effect_util.cc b/tensorflow/compiler/tf2xla/side_effect_util.cc
index b62f8e9115229ac35c657d374c68336f1168ff77..412f31adbb7df52b2d6933be054cc6d40947dc44 100644
--- a/tensorflow/compiler/tf2xla/side_effect_util.cc
+++ b/tensorflow/compiler/tf2xla/side_effect_util.cc
@@ -26,6 +26,49 @@ const char kXlaTokenArgNodeName[] = "_xla_token_arg_node";
 
 const char kXlaHasHostTransferAttrName[] = "_xla_has_host_transfer";
 
+Status SetDeviceOrdinalAttributeForNode(Node* node, int device_ordinal) {
+  if (!HasNodeAttr(node->def(), kXlaHasHostTransferAttrName)) {
+    return errors::InvalidArgument("Node ", node->DebugString(),
+                                   " does not have attribute ",
+                                   kXlaHasHostTransferAttrName);
+  }
+
+  if (node->type_string() == "_XlaRecvAtHost" ||
+      node->type_string() == "_XlaSendFromHost") {
+    node->ClearAttr("device_ordinal");
+    node->AddAttr("device_ordinal", device_ordinal);
+  } else if (node->type_string() == "If") {
+    AttrValue device_ordinal_value;
+    device_ordinal_value.set_i(device_ordinal);
+    for (const string& attr_name :
+         std::vector<string>{"then_branch", "else_branch"}) {
+      NameAttrList branch_func;
+      TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), attr_name, &branch_func));
+      (*branch_func.mutable_attr())["device_ordinal"] = device_ordinal_value;
+      node->ClearAttr(attr_name);
+      node->AddAttr(attr_name, branch_func);
+    }
+  } else if (node->type_string() == "While") {
+    AttrValue device_ordinal_value;
+    device_ordinal_value.set_i(device_ordinal);
+    for (const string& attr_name : std::vector<string>{"cond", "body"}) {
+      NameAttrList branch_func;
+      TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), attr_name, &branch_func));
+      (*branch_func.mutable_attr())["device_ordinal"] = device_ordinal_value;
+      node->ClearAttr(attr_name);
+      node->AddAttr(attr_name, branch_func);
+    }
+  } else if (HasNodeAttr(node->def(), "device_ordinal")) {
+    // Function call node containing outside compilation.
+    node->ClearAttr("device_ordinal");
+    node->AddAttr("device_ordinal", device_ordinal);
+  } else {
+    return errors::Internal("Unknown node type to set 'device_ordinal': ",
+                            node->DebugString());
+  }
+  return Status::OK();
+}
+
 std::set<std::string> CalculateTokenInputsForOutputToken(const Graph& g) {
   std::set<std::string> results;
   Node* first_side_effecting_node_on_path = nullptr;
diff --git a/tensorflow/compiler/tf2xla/side_effect_util.h b/tensorflow/compiler/tf2xla/side_effect_util.h
index 7081b362c36c4785164b29003a5f89cd73bcf3af..75e1f253fb08ae61b0336a8783b7449c69197dd1 100644
--- a/tensorflow/compiler/tf2xla/side_effect_util.h
+++ b/tensorflow/compiler/tf2xla/side_effect_util.h
@@ -38,6 +38,10 @@ extern const char kXlaTokenArgNodeName[];
 // This node have XlaRecvAtHost/XlaSendFromHost in its associated functions.
 extern const char kXlaHasHostTransferAttrName[];
 
+// Sets device ordinal attribute for nodes with attribute
+// `kXlaHasHostTransferAttrName`.
+Status SetDeviceOrdinalAttributeForNode(Node* node, int device_ordinal);
+
 // Calculates side-effect dependencies for the graph's token output.
 // Returns a set of node names representing these dependencies.
 std::set<std::string> CalculateTokenInputsForOutputToken(const Graph& g);
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index 32a430184099e8bac28182ea4c7790ea1b8266de..492010f7317d32a8a620147cd2cd9356d4f13fde 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -82,7 +82,7 @@ namespace {
 // compiled kernels.
 class DummyResourceForTest : public ResourceBase {
  public:
-  string DebugString() override { return "dummy"; }
+  string DebugString() const override { return "dummy"; }
   void Increment() { ++value_; }
   int Get() { return value_; }
 
diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc
index a69af70503376b6c0905deb8980abdc3254a6e47..6139bf3cea0790c2697130a993e92be96c81848b 100644
--- a/tensorflow/compiler/tf2xla/xla_context.cc
+++ b/tensorflow/compiler/tf2xla/xla_context.cc
@@ -61,7 +61,7 @@ void XlaContext::set_args(std::vector<XlaExpression> args) {
 XlaContext::XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder)
     : compiler_(compiler), builder_(builder) {}
 
-string XlaContext::DebugString() { return "XLA JIT context"; }
+string XlaContext::DebugString() const { return "XLA JIT context"; }
 
 void XlaContext::SetRetval(int index, const XlaExpression& expression) {
   if (retvals_.size() <= index) {
diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h
index 0767d1faac14cedb8666f6cc37175eb7b55f6158..eb4ad3fe6a14b42a4df2c73c71cb6df1331fd796 100644
--- a/tensorflow/compiler/tf2xla/xla_context.h
+++ b/tensorflow/compiler/tf2xla/xla_context.h
@@ -47,7 +47,7 @@ class XlaContext : public ResourceBase {
   XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder);
 
   // Virtual method defined by ResourceBase.
-  string DebugString() override;
+  string DebugString() const override;
 
   XlaCompiler* compiler() const { return compiler_; }
 
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index 00035d24b7891060adbc5ff871356b7471ab92ef..04a5d934064a9083a41cc210b48df65bbc862fff 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -34,63 +34,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-namespace {
-
-xla::XlaOp ArgMinMax(xla::XlaOp input, xla::PrimitiveType output_type, int axis,
-                     bool is_min) {
-  xla::XlaBuilder* builder = input.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape input_shape, builder->GetShape(input));
-    xla::XlaOp init_value;
-    xla::XlaComputation reducer;
-    if (is_min) {
-      init_value = xla::MaxValue(builder, input_shape.element_type());
-      reducer =
-          xla::CreateScalarMinComputation(input_shape.element_type(), builder);
-    } else {
-      init_value = xla::MinValue(builder, input_shape.element_type());
-      reducer =
-          xla::CreateScalarMaxComputation(input_shape.element_type(), builder);
-    }
-
-    xla::XlaOp input_max = xla::Reduce(input, init_value, reducer,
-                                       /*dimensions_to_reduce=*/{axis});
-    std::vector<int64> broadcast_dims(input_shape.rank() - 1);
-    std::iota(broadcast_dims.begin(), broadcast_dims.begin() + axis, 0);
-    std::iota(broadcast_dims.begin() + axis, broadcast_dims.end(), axis + 1);
-    // Compute a mask that has 1s for elements equal to the maximum.
-    xla::XlaOp partial_mask = xla::ConvertElementType(
-        xla::Eq(input, input_max, broadcast_dims), output_type);
-
-    // In order to make identity elements for a bitwise And, we:
-    //   Left shift the 1 to the leftmost bit, yielding 0x10...0
-    //   Arithmetic right shift the 1 back to the rightmost bit, yielding
-    //   0xFF...F
-    int32 bits_in_type =
-        xla::ShapeUtil::ByteSizeOfPrimitiveType(output_type) * 8 - 1;
-    xla::XlaOp shift_amount =
-        xla::ConstantR0WithType(builder, output_type, bits_in_type);
-    xla::XlaOp full_mask = xla::ShiftRightArithmetic(
-        xla::ShiftLeft(partial_mask, shift_amount), shift_amount);
-
-    // And with the vector [0, 1, 2, ...] to convert each 0xFF...F into its
-    // index.
-
-    const int64 axis_size = xla::ShapeUtil::GetDimension(input_shape, axis);
-    xla::XlaOp iota = xla::Iota(builder, output_type, axis_size);
-    xla::XlaOp product =
-        xla::And(full_mask, iota, /*broadcast_dimensions=*/{axis});
-
-    // If there are multiple maximum elements, choose the one with the highest
-    // index.
-    return xla::Reduce(product, xla::MinValue(builder, output_type),
-                       xla::CreateScalarMaxComputation(output_type, builder),
-                       /*dimensions_to_reduce=*/{axis});
-  });
-}
-
-}  // namespace
-
 xla::XlaOp XlaHelpers::Zero(xla::XlaBuilder* b, DataType data_type) {
   xla::PrimitiveType type;
   TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
@@ -148,16 +91,6 @@ static Tensor MakeLinspaceTensor(const TensorShape& shape, int64 depth) {
   return linspace;
 }
 
-xla::XlaOp XlaHelpers::ArgMax(xla::XlaOp input, xla::PrimitiveType output_type,
-                              int axis) {
-  return ArgMinMax(input, output_type, axis, /*is_min=*/false);
-}
-
-xla::XlaOp XlaHelpers::ArgMin(xla::XlaOp input, xla::PrimitiveType output_type,
-                              int axis) {
-  return ArgMinMax(input, output_type, axis, /*is_min=*/true);
-}
-
 Status XlaHelpers::OneHot(xla::XlaBuilder* builder, int64 depth, int axis,
                           DataType index_type, const TensorShape& indices_shape,
                           const xla::XlaOp& indices, const xla::XlaOp& on_value,
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.h b/tensorflow/compiler/tf2xla/xla_helpers.h
index 4858dfee55a393d04cd2af83916eeb40820ee368..490923526bd3acd4b167ccb3faff1d6c9e631131 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.h
+++ b/tensorflow/compiler/tf2xla/xla_helpers.h
@@ -53,16 +53,6 @@ class XlaHelpers {
                                absl::Span<const int64> shape,
                                xla::Literal* output);
 
-  // Returns the argmax of `input` along `axis`. `output_type` is the type to
-  // use for the output.
-  static xla::XlaOp ArgMax(xla::XlaOp input, xla::PrimitiveType output_type,
-                           int axis);
-
-  // Returns the argmin of `input` along `axis`. `output_type` is the type to
-  // use for the output.
-  static xla::XlaOp ArgMin(xla::XlaOp input, xla::PrimitiveType output_type,
-                           int axis);
-
   // Converts `indices` into a one-hot representation. `depth` is the size
   // of the new axis to add. `axis` is the position at which to add the new
   // axis. `indices_shape` is the shape of `indices`. `on_value` and
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 722d1376687efa1c04158e3fd9ce539aac9d0122..aa43fc8dabf81fd44fc1b7c10f5d0501b1a55af3 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -152,7 +152,7 @@ cc_library(
         ":status",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor/lib",
     ],
 )
 
@@ -717,6 +717,7 @@ cc_library(
         ":types",
         ":xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
     ],
@@ -741,6 +742,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_evaluator",
         "//tensorflow/compiler/xla/service:shape_inference",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:span",
     ],
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index 97fb4b9e0ee8bba23fc979cf02fc2ab79c7df1a7..df1ee330f1eb0a4556d5ec4d333c5dc4271fc5f7 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -34,6 +34,21 @@ cc_library(
     ],
 )
 
+xla_test(
+    name = "arithmetic_test",
+    srcs = ["arithmetic_test.cc"],
+    deps = [
+        ":arithmetic",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+    ],
+)
+
 cc_library(
     name = "cholesky",
     srcs = ["cholesky.cc"],
@@ -93,7 +108,6 @@ cc_library(
 xla_test(
     name = "constants_test",
     srcs = ["constants_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         ":constants",
         "//tensorflow/compiler/xla:test",
@@ -147,7 +161,6 @@ cc_library(
 xla_test(
     name = "math_test",
     srcs = ["math_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         ":math",
         "//tensorflow/compiler/xla:literal_util",
@@ -181,7 +194,6 @@ cc_library(
 xla_test(
     name = "matrix_test",
     srcs = ["matrix_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         ":matrix",
         ":slicing",
@@ -295,7 +307,6 @@ cc_library(
 xla_test(
     name = "slicing_test",
     srcs = ["slicing_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         ":slicing",
         "//tensorflow/compiler/xla:literal_util",
@@ -324,7 +335,6 @@ cc_library(
 xla_test(
     name = "sorting_test",
     srcs = ["sorting_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         ":sorting",
         "//tensorflow/compiler/xla:test",
@@ -352,7 +362,10 @@ cc_library(
 xla_test(
     name = "quantize_test",
     srcs = ["quantize_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
+    # TODO(b/122119490): re-enable TAP after fixing.
+    tags = [
+        "notap",
+    ],
     deps = [
         ":quantize",
         "//tensorflow/compiler/xla:test",
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.cc b/tensorflow/compiler/xla/client/lib/arithmetic.cc
index 33ff3971d7277e619db4bec99ffdf9c0c9d230ad..3b875135af29f142463ffd783bfeaadc61ada1af 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.cc
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.cc
@@ -123,4 +123,64 @@ XlaOp Any(XlaOp predicates) {
   });
 }
 
+namespace {
+
+XlaOp ArgMinMax(XlaOp input, PrimitiveType output_type, int axis, bool is_min) {
+  XlaBuilder* builder = input.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input));
+    XlaOp init_value;
+    XlaComputation reducer;
+    if (is_min) {
+      init_value = MaxValue(builder, input_shape.element_type());
+      reducer = CreateScalarMinComputation(input_shape.element_type(), builder);
+    } else {
+      init_value = MinValue(builder, input_shape.element_type());
+      reducer = CreateScalarMaxComputation(input_shape.element_type(), builder);
+    }
+
+    XlaOp input_max = Reduce(input, init_value, reducer,
+                             /*dimensions_to_reduce=*/{axis});
+    std::vector<int64> broadcast_dims(input_shape.rank() - 1);
+    std::iota(broadcast_dims.begin(), broadcast_dims.begin() + axis, 0);
+    std::iota(broadcast_dims.begin() + axis, broadcast_dims.end(), axis + 1);
+    // Compute a mask that has 1s for elements equal to the maximum.
+    XlaOp partial_mask =
+        ConvertElementType(Eq(input, input_max, broadcast_dims), output_type);
+
+    // In order to make identity elements for a bitwise And, we:
+    //   Left shift the 1 to the leftmost bit, yielding 0x10...0
+    //   Arithmetic right shift the 1 back to the rightmost bit, yielding
+    //   0xFF...F
+    int32 bits_in_type =
+        ShapeUtil::ByteSizeOfPrimitiveType(output_type) * 8 - 1;
+    XlaOp shift_amount = ConstantR0WithType(builder, output_type, bits_in_type);
+    XlaOp full_mask = ShiftRightArithmetic(
+        ShiftLeft(partial_mask, shift_amount), shift_amount);
+
+    // And with the vector [0, 1, 2, ...] to convert each 0xFF...F into its
+    // index.
+
+    const int64 axis_size = ShapeUtil::GetDimension(input_shape, axis);
+    XlaOp iota = Iota(builder, output_type, axis_size);
+    XlaOp product = And(full_mask, iota, /*broadcast_dimensions=*/{axis});
+
+    // If there are multiple maximum elements, choose the one with the highest
+    // index.
+    return Reduce(product, MinValue(builder, output_type),
+                  CreateScalarMaxComputation(output_type, builder),
+                  /*dimensions_to_reduce=*/{axis});
+  });
+}
+
+}  // namespace
+
+XlaOp ArgMax(XlaOp input, PrimitiveType output_type, int axis) {
+  return ArgMinMax(input, output_type, axis, /*is_min=*/false);
+}
+
+XlaOp ArgMin(XlaOp input, PrimitiveType output_type, int axis) {
+  return ArgMinMax(input, output_type, axis, /*is_min=*/true);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.h b/tensorflow/compiler/xla/client/lib/arithmetic.h
index 632e8cc8bc64fad236a0226c6e93079aadde7050..d4a7812c441c351b121e5d72faf9642b06728b18 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.h
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.h
@@ -57,6 +57,14 @@ XlaComputation CreateScalarOrComputation(PrimitiveType type,
 // Note: if predicates is zero-sized, Any() vacuously returns false.
 XlaOp Any(XlaOp predicates);
 
+// Returns the argmax of `input` along `axis`. `output_type` is the type to
+// use for the output.
+XlaOp ArgMax(XlaOp input, PrimitiveType output_type, int axis);
+
+// Returns the argmin of `input` along `axis`. `output_type` is the type to
+// use for the output.
+XlaOp ArgMin(XlaOp input, PrimitiveType output_type, int axis);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_ARITHMETIC_H_
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic_test.cc b/tensorflow/compiler/xla/client/lib/arithmetic_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a13839f9db89b9c07f2465867a503ef2193f8160
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/arithmetic_test.cc
@@ -0,0 +1,67 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace {
+
+using ArithmeticTest = ClientLibraryTestBase;
+
+XLA_TEST_F(ArithmeticTest, ArgMinR2Axis0) {
+  XlaBuilder builder(TestName());
+  auto x = ConstantR2<int32>(&builder, {{1, 7, 4}, {6, 3, 5}, {8, 3, 3}});
+  ArgMin(x, S32, /*axis=*/0);
+
+  std::vector<int32> expected = {0, 2, 2};
+  ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+XLA_TEST_F(ArithmeticTest, ArgMinR2Axis1) {
+  XlaBuilder builder(TestName());
+  auto x = ConstantR2<int32>(&builder, {{1, 7, 4}, {6, 3, 5}, {8, 3, 3}});
+  ArgMin(x, S32, /*axis=*/1);
+
+  std::vector<int32> expected = {0, 1, 2};
+  ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+XLA_TEST_F(ArithmeticTest, ArgMaxR2Axis0) {
+  XlaBuilder builder(TestName());
+  auto x = ConstantR2<int32>(&builder, {{1, 7, 4}, {6, 3, 5}, {8, 3, 3}});
+  ArgMax(x, S32, /*axis=*/0);
+
+  std::vector<int32> expected = {2, 0, 1};
+  ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+XLA_TEST_F(ArithmeticTest, ArgMaxR2Axis1) {
+  XlaBuilder builder(TestName());
+  auto x = ConstantR2<int32>(&builder, {{1, 7, 4}, {6, 3, 5}, {8, 3, 3}});
+  ArgMax(x, S32, /*axis=*/1);
+
+  std::vector<int32> expected = {1, 0, 0};
+  ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/cholesky.cc b/tensorflow/compiler/xla/client/lib/cholesky.cc
index e100d47922efb6655e40a19b5c4cbb2190fac6d9..414bd1494cd32f32a5c37e84119de930678a776b 100644
--- a/tensorflow/compiler/xla/client/lib/cholesky.cc
+++ b/tensorflow/compiler/xla/client/lib/cholesky.cc
@@ -68,29 +68,26 @@ XlaOp CholeskyUnblocked(XlaOp a, PrecisionConfig::Precision precision) {
     auto body_fn =
         [&](XlaOp i, absl::Span<const XlaOp> loop_vars,
             XlaBuilder* body_builder) -> StatusOr<std::vector<XlaOp>> {
-      Shape col_shape;
-      Shape row_shape;
-      for (int64 d : major_dims) {
-        row_shape.add_dimensions(d);
-        col_shape.add_dimensions(d);
-      }
-      row_shape.add_dimensions(1);
-      row_shape.add_dimensions(n);
-      row_shape.set_element_type(a_shape.element_type());
-      auto mask_zeros_row = Zeros(body_builder, row_shape);
-
-      col_shape.add_dimensions(n);
-      col_shape.add_dimensions(1);
-      col_shape.set_element_type(a_shape.element_type());
-      auto mask_zeros_col = Zeros(body_builder, col_shape);
-
-      std::vector<int32> mask_vector(n);
-      std::iota(mask_vector.begin(), mask_vector.end(), 0);
-      auto mask_range = ConstantR1<int32>(body_builder, mask_vector);
+      std::vector<int64> row_shape_dims(major_dims.begin(), major_dims.end());
+      std::vector<int64> col_shape_dims(major_dims.begin(), major_dims.end());
+      row_shape_dims.push_back(1);
+      row_shape_dims.push_back(n);
+      auto mask_zeros_row =
+          Zeros(body_builder,
+                ShapeUtil::MakeShape(a_shape.element_type(), row_shape_dims));
+
+      col_shape_dims.push_back(n);
+      col_shape_dims.push_back(1);
+      auto mask_zeros_col =
+          Zeros(body_builder,
+                ShapeUtil::MakeShape(a_shape.element_type(), col_shape_dims));
+
       auto mask_range_row =
-          Broadcast(Reshape(mask_range, {0}, {1, n}), major_dims);
+          Iota(body_builder, ShapeUtil::MakeShape(S32, row_shape_dims),
+               /*iota_dimension=*/n_dims - 1);
       auto mask_range_col =
-          Broadcast(Reshape(mask_range, {0}, {n, 1}), major_dims);
+          Iota(body_builder, ShapeUtil::MakeShape(S32, col_shape_dims),
+               /*iota_dimension=*/n_dims - 2);
       auto body_a = loop_vars[0];
       auto body_l = loop_vars[1];
 
diff --git a/tensorflow/compiler/xla/client/lib/slicing.cc b/tensorflow/compiler/xla/client/lib/slicing.cc
index 611fffba8d0544a011cb641802058c08d95cf5f7..77145ba7d4c72435450d3e33d57b2507eb84d2fc 100644
--- a/tensorflow/compiler/xla/client/lib/slicing.cc
+++ b/tensorflow/compiler/xla/client/lib/slicing.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/client/lib/slicing.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 
 namespace xla {
 
@@ -51,17 +52,17 @@ XlaOp SliceInMinorDims(XlaOp x, absl::Span<const int64> start,
 XlaOp UpdateSlice(XlaOp x, XlaOp update, absl::Span<const int64> start) {
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    // TODO(phawkins): make int64 work on all backends, remove the int32 cast.
-    std::vector<int32> start_as_int32(start.begin(), start.end());
-    auto start_constant = ConstantR1<int32>(builder, start_as_int32);
     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
     const int64 n_dims = shape.rank();
-    TF_ASSIGN_OR_RETURN(Shape start_constant_shape,
-                        builder->GetShape(start_constant));
-    const int64 start_length =
-        ShapeUtil::GetDimension(start_constant_shape, -1);
-    TF_RET_CHECK(start_length == n_dims);
-    return DynamicUpdateSlice(x, update, start_constant);
+    TF_RET_CHECK(start.size() == n_dims);
+
+    // TODO(phawkins): make int64 work on all backends, remove the int32 cast.
+    std::vector<int32> start_as_int32(start.begin(), start.end());
+    std::vector<XlaOp> start_ops(start.size());
+    for (int i = 0; i < start.size(); ++i) {
+      start_ops[i] = ConstantR0(builder, start_as_int32[i]);
+    }
+    return DynamicUpdateSlice(x, update, start_ops);
   });
 }
 
@@ -90,18 +91,17 @@ std::vector<int64> ConcatVectors(absl::Span<const int64> xs,
   return output;
 }
 
-XlaOp PrependZerosInMajorDims(XlaOp x, absl::Span<const XlaOp> starts) {
+StatusOr<std::vector<XlaOp>> PrependZerosInMajorDims(
+    XlaOp x, absl::Span<const XlaOp> starts) {
   XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = shape.rank();
-    auto zero = Reshape(ConstantR0<int32>(builder, 0), {1});
-    std::vector<XlaOp> padded_starts(n_dims, zero);
-    for (int i = 0; i < starts.size(); ++i) {
-      padded_starts[n_dims - starts.size() + i] = Reshape(starts[i], {1});
-    }
-    return ConcatInDim(builder, padded_starts, 0);
-  });
+  TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+  const int64 n_dims = shape.rank();
+  auto zero = ConstantR0<int32>(builder, 0);
+  std::vector<XlaOp> padded_starts(n_dims, zero);
+  for (int i = 0; i < starts.size(); ++i) {
+    padded_starts[n_dims - starts.size() + i] = starts[i];
+  }
+  return padded_starts;
 }
 
 }  // namespace
@@ -119,7 +119,7 @@ XlaOp DynamicSliceInMinorDims(XlaOp x, absl::Span<const XlaOp> starts,
                           .subspan(
                               /*pos=*/0,
                               /*len=*/n_dims - sizes.size());
-    auto padded_starts = PrependZerosInMajorDims(x, starts);
+    TF_ASSIGN_OR_RETURN(auto padded_starts, PrependZerosInMajorDims(x, starts));
     auto padded_sizes = ConcatVectors(major_dims, sizes);
     return DynamicSlice(x, padded_starts, padded_sizes);
   });
@@ -127,8 +127,11 @@ XlaOp DynamicSliceInMinorDims(XlaOp x, absl::Span<const XlaOp> starts,
 
 XlaOp DynamicUpdateSliceInMinorDims(XlaOp x, XlaOp update,
                                     absl::Span<const XlaOp> starts) {
-  auto padded_starts = PrependZerosInMajorDims(x, starts);
-  return DynamicUpdateSlice(x, update, padded_starts);
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(auto padded_starts, PrependZerosInMajorDims(x, starts));
+    return DynamicUpdateSlice(x, update, padded_starts);
+  });
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/sorting_test.cc b/tensorflow/compiler/xla/client/lib/sorting_test.cc
index 27ff36c7491ab8397d46f3a49493ff2b904deb2d..0fbd138aca1e86f219d0459086fc09d20844f135 100644
--- a/tensorflow/compiler/xla/client/lib/sorting_test.cc
+++ b/tensorflow/compiler/xla/client/lib/sorting_test.cc
@@ -77,7 +77,7 @@ XLA_TEST_F(SortingTest, TopKFullSort) {
   auto x = ConstantR1<float>(&builder, inputs);
   xla::GetTupleElement(xla::TopK(x, kSize), 0);
 
-  std::sort(inputs.begin(), inputs.end(), std::greater<float>());
+  absl::c_sort(inputs, std::greater<float>());
   ComputeAndCompareR1<float>(&builder, inputs, {});
 }
 
diff --git a/tensorflow/compiler/xla/client/lib/triangular_solve.cc b/tensorflow/compiler/xla/client/lib/triangular_solve.cc
index 6061e64656e859adcc52c250e7775a878399f4e6..c2f31742e9eff9f325fb71160b4ec3aea928d15e 100644
--- a/tensorflow/compiler/xla/client/lib/triangular_solve.cc
+++ b/tensorflow/compiler/xla/client/lib/triangular_solve.cc
@@ -165,10 +165,10 @@ XlaOp InvertDiagonalBlocks(XlaOp diag_blocks, bool lower, bool transpose_a,
     // The first or last  diagonal element should be set to 1 instead of -1
     // though, since we never update it
     auto pos_one = Reshape(One(builder, shape.element_type()), {1, 1});
-    auto start_index = (lower) ? 0 : block_size - 1;
-    auto output_block = DynamicUpdateSlice(
-        neg_identity, pos_one,
-        /*start_indices=*/ConstantR1<int>(builder, 2, start_index));
+    auto start_index = ConstantR0<int>(builder, (lower) ? 0 : block_size - 1);
+    auto output_block =
+        DynamicUpdateSlice(neg_identity, pos_one,
+                           /*start_indices=*/{start_index, start_index});
 
     // Broadcast diag([1, -1, -1, ...]) to every block
     XlaOp output = Broadcast(output_block,
@@ -211,12 +211,10 @@ XlaOp InvertDiagonalBlocks(XlaOp diag_blocks, bool lower, bool transpose_a,
       auto body_out = GetTupleElement(input_tuple, 1);
       auto body_input = GetTupleElement(input_tuple, 2);
 
-      auto zero = ConstantR1<int32>(bodyb.get(), 1, 0);
+      auto zero = ConstantR0<int32>(bodyb.get(), 0);
       auto j = (lower) ? i : ScalarLike(i, block_size - 1) - i;
-      auto start_indices =
-          ConcatInDim(bodyb.get(), {zero, Reshape(j, {1}), zero}, 0);
       auto input_row =
-          DynamicSlice(body_input, start_indices,
+          DynamicSlice(body_input, {zero, j, zero},
                        /*slice_sizes=*/{num_blocks, 1, block_size});
 
       // We want -L21 L11^{-1}
@@ -230,7 +228,7 @@ XlaOp InvertDiagonalBlocks(XlaOp diag_blocks, bool lower, bool transpose_a,
       precision_proto.add_operand_precision(precision);
       auto update = -DotGeneral(input_row, body_out, dnums, &precision_proto);
 
-      body_out = DynamicUpdateSlice(body_out, update, start_indices);
+      body_out = DynamicUpdateSlice(body_out, update, {zero, j, zero});
 
       auto next_i = i + ScalarLike(i, 1);
       Tuple(bodyb.get(), {next_i, body_out, body_input});
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index 049cd15738a619294b19d5cf74ca514d7b4a00ad..48b5f94538f453785194bc434a91ee0a10c020c2 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -164,9 +164,8 @@ StatusOr<ScopedShapedBuffer> LocalExecutable::Run(
   //    ExecutableRunOptions.eigen_intra_op_thread_pool.
   // *) The thread pool used for XLA CPU ops is from
   //    backend_->eigen_intra_op_thread_pool().
-  ServiceExecutableRunOptions service_options(
-      run_options, backend_->StreamBorrower(),
-      backend_->eigen_intra_op_thread_pool());
+  ServiceExecutableRunOptions service_options(run_options,
+                                              backend_->StreamBorrower());
 
   if (executable_->dumping_snapshot()) {
     return ExecuteAndDump(&service_options, arguments);
diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
index 7c72cdfeb50c552017647ef4a952e139b0b4adfa..59e156eb7292fe3c28345faa7247e08e3d46c487 100644
--- a/tensorflow/compiler/xla/client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -22,6 +22,8 @@ limitations under the License.
 #include <utility>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
@@ -29,6 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/sharding_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
+#include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
@@ -192,9 +195,9 @@ StatusOr<ProgramShape> XlaBuilder::GetProgramShape(XlaOp root) const {
 }
 
 void XlaBuilder::IsConstantVisitor(const int64 op_handle,
-                                   std::set<int64>* visited,
+                                   absl::flat_hash_set<int64>* visited,
                                    bool* is_constant) const {
-  if (visited->count(op_handle) != 0 || !*is_constant) {
+  if (visited->contains(op_handle) || !*is_constant) {
     return;
   }
 
@@ -244,6 +247,29 @@ Status XlaBuilder::SetDynamicBinding(int64 dynamic_size_param_num,
                                      int64 target_param_num,
                                      ShapeIndex target_param_index,
                                      int64 target_dim_num) {
+  bool param_exists = false;
+  for (HloInstructionProto& instr : instructions_) {
+    if (instr.opcode() == HloOpcodeString(HloOpcode::kParameter) &&
+        instr.parameter_number() == target_param_num) {
+      param_exists = true;
+      Shape param_shape(instr.shape());
+      Shape* param_shape_ptr = &param_shape;
+      for (int64 index : target_param_index) {
+        param_shape_ptr = param_shape_ptr->mutable_tuple_shapes(index);
+      }
+      param_shape_ptr->set_dynamic_dimension(target_dim_num,
+                                             /*is_dynamic=*/true);
+      *instr.mutable_shape() = param_shape.ToProto();
+    }
+  }
+
+  if (!param_exists) {
+    return InvalidArgument(
+        "Asked to mark parameter %lld as dynamic sized parameter, but the "
+        "doesn't exists",
+        target_param_num);
+  }
+
   TF_RETURN_IF_ERROR(dynamic_parameter_binding_.Bind(
       DynamicParameterBinding::DynamicParameter{dynamic_size_param_num,
                                                 dynamic_size_param_index},
@@ -310,7 +336,10 @@ StatusOr<XlaComputation> XlaBuilder::Build(int64 root_id) {
     module->add_computations()->Swap(&e.second);
   }
   module->add_computations()->Swap(&entry);
-
+  if (!input_output_aliases_.empty()) {
+    TF_RETURN_IF_ERROR(
+        PopulateInputOutputAlias(module, program_shape, input_output_aliases_));
+  }
   *(module->mutable_dynamic_parameter_binding()) =
       dynamic_parameter_binding_.ToProto();
 
@@ -323,6 +352,35 @@ StatusOr<XlaComputation> XlaBuilder::Build(int64 root_id) {
   return std::move(computation);
 }
 
+/* static */ Status XlaBuilder::PopulateInputOutputAlias(
+    HloModuleProto* module, const ProgramShape& program_shape,
+    const std::vector<InputOutputAlias>& input_output_aliases) {
+  HloInputOutputAliasConfig config(program_shape.result());
+  for (auto& alias : input_output_aliases) {
+    // The HloInputOutputAliasConfig does not do parameter validation as it only
+    // carries the result shape. Maybe it should be constructed with a
+    // ProgramShape to allow full validation. We will still get an error when
+    // trying to compile the HLO module, but would be better to have validation
+    // at this stage.
+    if (alias.param_number >= program_shape.parameters_size()) {
+      return InvalidArgument("Invalid parameter number %ld (total %ld)",
+                             alias.param_number,
+                             program_shape.parameters_size());
+    }
+    const Shape& parameter_shape = program_shape.parameters(alias.param_number);
+    if (!ShapeUtil::IndexIsValid(parameter_shape, alias.param_index)) {
+      return InvalidArgument("Invalid parameter %ld index: %s",
+                             alias.param_number,
+                             alias.param_index.ToString().c_str());
+    }
+    TF_RETURN_IF_ERROR(config.SetUpAlias(
+        alias.output_index, alias.param_number, alias.param_index,
+        HloInputOutputAliasConfig::AliasKind::kUserAlias));
+  }
+  *module->mutable_input_output_alias() = config.ToProto();
+  return Status::OK();
+}
+
 StatusOr<XlaOp> XlaBuilder::InDimBroadcast(
     const Shape& shape, const XlaOp& operand,
     absl::Span<const int64> broadcast_dimensions) {
@@ -659,7 +717,7 @@ XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
                         GetShape(start_indices));
     TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferDynamicSliceShape(
-                            operand_shape, start_indices_shape, slice_sizes));
+                            operand_shape, {start_indices_shape}, slice_sizes));
     *instr.mutable_shape() = shape.ToProto();
 
     for (int64 size : slice_sizes) {
@@ -671,6 +729,34 @@ XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
   });
 }
 
+XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand,
+                               absl::Span<const XlaOp> start_indices,
+                               absl::Span<const int64> slice_sizes) {
+  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    std::vector<const Shape*> start_indices_shape_ptrs;
+    TF_ASSIGN_OR_RETURN(const auto& start_indices_shapes,
+                        GetOperandShapes(start_indices));
+    absl::c_transform(start_indices_shapes,
+                      std::back_inserter(start_indices_shape_ptrs),
+                      [](const Shape& shape) { return &shape; });
+    TF_ASSIGN_OR_RETURN(Shape shape,
+                        ShapeInference::InferDynamicSliceShape(
+                            operand_shape, start_indices_shapes, slice_sizes));
+    *instr.mutable_shape() = shape.ToProto();
+
+    for (int64 size : slice_sizes) {
+      instr.add_dynamic_slice_sizes(size);
+    }
+
+    std::vector<XlaOp> operands = {operand};
+    operands.insert(operands.end(), start_indices.begin(), start_indices.end());
+    return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice, operands);
+  });
+}
+
 XlaOp XlaBuilder::DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
                                      const XlaOp& start_indices) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
@@ -680,13 +766,38 @@ XlaOp XlaBuilder::DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
     TF_ASSIGN_OR_RETURN(const Shape& update_shape, GetShape(update));
     TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape,
                         GetShape(start_indices));
+    TF_ASSIGN_OR_RETURN(
+        Shape shape, ShapeInference::InferDynamicUpdateSliceShape(
+                         operand_shape, update_shape, {start_indices_shape}));
+    *instr.mutable_shape() = shape.ToProto();
+
+    return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice,
+                          {operand, update, start_indices});
+  });
+}
+
+XlaOp XlaBuilder::DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                                     absl::Span<const XlaOp> start_indices) {
+  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& update_shape, GetShape(update));
+    std::vector<const Shape*> start_indices_shape_ptrs;
+    TF_ASSIGN_OR_RETURN(const auto& start_indices_shapes,
+                        GetOperandShapes(start_indices));
+    absl::c_transform(start_indices_shapes,
+                      std::back_inserter(start_indices_shape_ptrs),
+                      [](const Shape& shape) { return &shape; });
     TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferDynamicUpdateSliceShape(
-                            operand_shape, update_shape, start_indices_shape));
+                            operand_shape, update_shape, start_indices_shapes));
     *instr.mutable_shape() = shape.ToProto();
 
+    std::vector<XlaOp> operands = {operand, update};
+    operands.insert(operands.end(), start_indices.begin(), start_indices.end());
     return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice,
-                          {operand, update, start_indices});
+                          operands);
   });
 }
 
@@ -2383,7 +2494,7 @@ StatusOr<bool> XlaBuilder::IsConstant(const XlaOp& operand) const {
   TF_RETURN_IF_ERROR(LookUpInstruction(operand).status());
 
   bool is_constant = true;
-  std::set<int64> visited;
+  absl::flat_hash_set<int64> visited;
   IsConstantVisitor(operand.handle(), &visited, &is_constant);
   return is_constant;
 }
@@ -2717,12 +2828,21 @@ XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
                    absl::Span<const int64> slice_sizes) {
   return operand.builder()->DynamicSlice(operand, start_indices, slice_sizes);
 }
+XlaOp DynamicSlice(const XlaOp& operand, absl::Span<const XlaOp> start_indices,
+                   absl::Span<const int64> slice_sizes) {
+  return operand.builder()->DynamicSlice(operand, start_indices, slice_sizes);
+}
 
 XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
                          const XlaOp& start_indices) {
   return operand.builder()->DynamicUpdateSlice(operand, update, start_indices);
 }
 
+XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                         absl::Span<const XlaOp> start_indices) {
+  return operand.builder()->DynamicUpdateSlice(operand, update, start_indices);
+}
+
 XlaOp ConcatInDim(XlaBuilder* builder, absl::Span<const XlaOp> operands,
                   int64 dimension) {
   return builder->ConcatInDim(operands, dimension);
diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h
index 6e9b025e5d70c03e9f4c7e7fbc89976f314d48d7..8908d172fa89632ead48f954de12066af12411c7 100644
--- a/tensorflow/compiler/xla/client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_builder.h
@@ -276,7 +276,22 @@ class XlaBuilder {
                            int64 target_param_num,
                            ShapeIndex target_param_index, int64 target_dim_num);
 
+  // Adds a new input/output alias. Since the input/ouput shape information are
+  // not available until the computation is built, and eventual error in the
+  // arguments of this API will be detected only at computation Build() time.
+  void SetUpAlias(const ShapeIndex& output_index, int64 param_number,
+                  const ShapeIndex& param_index) {
+    input_output_aliases_.push_back({output_index, param_number, param_index});
+  }
+
  private:
+  // Describes an input/output alias as inserted by the SetUpAlias() API.
+  struct InputOutputAlias {
+    ShapeIndex output_index;
+    int64 param_number;
+    ShapeIndex param_index;
+  };
+
   // Build helper which takes the id of the root operation..
   StatusOr<XlaComputation> Build(int64 root_id);
 
@@ -344,11 +359,18 @@ class XlaBuilder {
   XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
                    int64 stride, int64 dimno);
 
+  ABSL_DEPRECATED("Use span-of-indices form instead")
   XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
                      absl::Span<const int64> slice_sizes);
+  XlaOp DynamicSlice(const XlaOp& operand,
+                     absl::Span<const XlaOp> start_indices,
+                     absl::Span<const int64> slice_sizes);
 
+  ABSL_DEPRECATED("Use span-of-indices form instead")
   XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
                            const XlaOp& start_indices);
+  XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                           absl::Span<const XlaOp> start_indices);
 
   XlaOp ConcatInDim(absl::Span<const XlaOp> operands, int64 dimension);
 
@@ -712,7 +734,8 @@ class XlaBuilder {
   // operation such as `RngNormal` or `Infeed`. The visitor walks the
   // computation starting at a given operation and sets is_constant to false iff
   // a parameter or stateful operation is encountered.
-  void IsConstantVisitor(const int64 op_handle, std::set<int64>* visited,
+  void IsConstantVisitor(const int64 op_handle,
+                         absl::flat_hash_set<int64>* visited,
                          bool* is_constant) const;
 
   // Checks bounds for convolution parameters.
@@ -730,6 +753,12 @@ class XlaBuilder {
 
   int64 GetNextId() { return ++next_id_; }
 
+  // Populates the module with the input/output alias information stored within
+  // the input_output_aliases vector.
+  static Status PopulateInputOutputAlias(
+      HloModuleProto* module, const ProgramShape& program_shape,
+      const std::vector<InputOutputAlias>& input_output_aliases);
+
   string name_;  // Name to use for the built computation.
 
   // The next sequential ID for every instruction/computation contained within
@@ -749,6 +778,9 @@ class XlaBuilder {
   // Dynamic parameter configuration of this computation.
   DynamicParameterBinding dynamic_parameter_binding_;
 
+  // Holds the input/output alias information populated by the SetUpAlias() API.
+  std::vector<InputOutputAlias> input_output_aliases_;
+
   // A map from XlaOp::Handle to the index in the instructions_ vector where the
   // instruction is held.
   absl::flat_hash_map<int64, int64> handle_to_index_;
@@ -850,9 +882,14 @@ class XlaBuilder {
 
   friend XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
                             absl::Span<const int64> slice_sizes);
+  friend XlaOp DynamicSlice(const XlaOp& operand,
+                            absl::Span<const XlaOp> start_indices,
+                            absl::Span<const int64> slice_sizes);
 
   friend XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
                                   const XlaOp& start_indices);
+  friend XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                                  absl::Span<const XlaOp> start_indices);
 
   friend XlaOp ConcatInDim(XlaBuilder* builder,
                            absl::Span<const XlaOp> operands, int64 dimension);
@@ -1294,10 +1331,15 @@ XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
 // The size of the slice in each dimension is passed in 'slice_sizes',
 // which specify the end point of exclusive slice intervals in each
 // dimension [start, start + size).
-// The shape of 'start_indices' must be rank == 1, with dimension size
-// equal to the rank of the 'operand'.
+// The shape of each element of 'start_indices' must be scalar, with the span
+// size equal to the rank of the 'operand'. All elements of 'start_indices' must
+// have the same shape.
 // Slice index calculations are computed modulo input dimension sizes to
 // prevent dynamic start indices from generating out-of-bound array accesses.
+XlaOp DynamicSlice(const XlaOp& operand, absl::Span<const XlaOp> start_indices,
+                   absl::Span<const int64> slice_sizes);
+
+ABSL_DEPRECATED("Use span-of-indices form instead")
 XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
                    absl::Span<const int64> slice_sizes);
 
@@ -1313,10 +1355,15 @@ XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
 //   [4 5 6]  => DynamicUpdateslice(data, update, start)   => [4 10 11]
 //   [7 8 9]                                                  [7 8  9 ]
 //
-// The shape of 'start_indices' must be rank == 1, with dimension size
-// equal to the rank of the 'operand'.
+// The shape of each element of 'start_indices' must be scalar, with the span
+// size equal to the rank of the 'operand'. All elements of 'start_indices' must
+// have the same shape.
 // Slice index calculations are computed modulo update dimension sizes to
 // prevent dynamic start indices from generating out-of-bound array accesses.
+XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                         absl::Span<const XlaOp> start_indices);
+
+ABSL_DEPRECATED("Use span-of-indices form instead")
 XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
                          const XlaOp& start_indices);
 
diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc
index b3f5be300d3f15397ad33858a6a9cab5f6029688..abc11b4732dd1c041d9c78aa7cac40129d82a785 100644
--- a/tensorflow/compiler/xla/client/xla_builder_test.cc
+++ b/tensorflow/compiler/xla/client/xla_builder_test.cc
@@ -446,6 +446,26 @@ TEST_F(XlaBuilderTest, ProtoMatches) {
   EXPECT_EQ(c0_string, c1_string);
 }
 
+TEST_F(XlaBuilderTest, DynamicParameter) {
+  std::vector<XlaComputation> computations;
+  XlaBuilder b("builder");
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5}), ShapeUtil::MakeShape(F32, {6})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  Parameter(&b, 1, ShapeUtil::MakeShape(U32, {}), "p1");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/1,
+                                   /*dynamic_size_param_index=*/{},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/0));
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b, /*root=*/p0));
+  const Shape& param_shape = module->entry_computation()
+                                 ->parameter_instruction(0)
+                                 ->shape()
+                                 .tuple_shapes(1);
+  EXPECT_TRUE(param_shape.is_dynamic_dimension(0));
+}
+
 TEST_F(XlaBuilderTest, AfterAllWithNonTokenOperands) {
   XlaBuilder b(TestName());
   AfterAll(&b, {CreateToken(&b), ConstantR0<float>(&b, 1.0)});
@@ -455,5 +475,31 @@ TEST_F(XlaBuilderTest, AfterAllWithNonTokenOperands) {
               ::testing::HasSubstr("All operands to AfterAll must be tokens"));
 }
 
+TEST_F(XlaBuilderTest, CheckInputOutputAlias) {
+  XlaBuilder b(TestName());
+  auto p0 = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {8, 4}), "p0");
+  auto p1 = Parameter(&b, 1, ShapeUtil::MakeShape(F32, {8, 4}), "p1");
+  auto add = Add(p0, p1);
+  auto sub = Sub(p0, p1);
+  auto root = Tuple(&b, {add, sub});
+
+  b.SetUpAlias({1}, 0, {});
+  b.SetUpAlias({0}, 1, {});
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b, root));
+
+  const HloInputOutputAliasConfig& config = module->input_output_alias_config();
+  EXPECT_TRUE(config.ParameterHasAlias(0, {}));
+  EXPECT_TRUE(config.ParameterHasAlias(1, {}));
+
+  auto alias_p0 = config.GetAliasedOutput(0, {});
+  ASSERT_TRUE(alias_p0.has_value());
+  EXPECT_EQ(*alias_p0, ShapeIndex({1}));
+
+  auto alias_p1 = config.GetAliasedOutput(1, {});
+  ASSERT_TRUE(alias_p1.has_value());
+  EXPECT_EQ(*alias_p1, ShapeIndex({0}));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md
index 9a9cd08c301502cbda8858225182d95fca4bf7ae..59ba9bb6584c580b237fc2de126db58453854581 100644
--- a/tensorflow/compiler/xla/g3doc/operation_semantics.md
+++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md
@@ -871,9 +871,7 @@ DotGeneral performs the sum of products over contracting dimensions specified
 in 'dimension_numbers'.
 
 Associated contracting dimension numbers from the 'lhs' and 'rhs' do not need
-to be the same, but must be listed in the same order in both
-'lhs/rhs_contracting_dimensions' arrays and have the same dimension sizes.
-There must be exactly one contracting dimension on both 'lhs' and 'rhs'.
+to be the same and but must have the same dimension sizes.
 
 Example with contracting dimension numbers:
 
@@ -892,10 +890,8 @@ DotGeneral(lhs, rhs, dnums) -> { {6.0, 12.0},
 {15.0, 30.0} }
 ```
 
-Associated batch dimension numbers from the 'lhs' and 'rhs' must have the same
-dimension number, must be listed in the same order in both arrays, must
-have the same dimension sizes, and must be ordered before contracting and
-non-contracting/non-batch dimension numbers.
+Associated batch dimension numbers from the 'lhs' and 'rhs' must
+have the same dimension sizes.
 
 Example with batch dimension numbers (batch size 2, 2x2 matrices):
 
@@ -944,21 +940,21 @@ dimension: [start, start + size). The shape of `start_indices` must be rank ==
 
 <b> `DynamicSlice(operand, start_indices, size_indices)` </b>
 
-| Arguments       | Type                | Semantics                           |
-| --------------- | ------------------- | ----------------------------------- |
-| `operand`       | `XlaOp`             | N dimensional array of type T       |
-| `start_indices` | `XlaOp`             | Rank 1 array of N integers          |
-:                 :                     : containing the starting indices of  :
-:                 :                     : the slice for each dimension. Value :
-:                 :                     : must be greater than or equal to    :
-:                 :                     : zero.                               :
-| `size_indices`  | `ArraySlice<int64>` | List of N integers containing the   |
-:                 :                     : slice size for each dimension. Each :
-:                 :                     : value must be strictly greater than :
-:                 :                     : zero, and start + size must be less :
-:                 :                     : than or equal to the size of the    :
-:                 :                     : dimension to avoid wrapping modulo  :
-:                 :                     : dimension size.                     :
+| Arguments       | Type                  | Semantics                          |
+| --------------- | --------------------- | ---------------------------------- |
+| `operand`       | `XlaOp`               | N dimensional array of type T      |
+| `start_indices` | sequence of N `XlaOp` | List of N scalar integers          |
+:                 :                       : containing the starting indices of :
+:                 :                       : the slice for each dimension.      :
+:                 :                       : Value must be greater than or      :
+:                 :                       : equal to zero.                     :
+| `size_indices`  | `ArraySlice<int64>`   | List of N integers containing the  |
+:                 :                       : slice size for each dimension.     :
+:                 :                       : Each value must be strictly        :
+:                 :                       : greater than zero, and start +     :
+:                 :                       : size must be less than or equal to :
+:                 :                       : the size of the dimension to avoid :
+:                 :                       : wrapping modulo dimension size.    :
 
 The effective slice indices are computed by applying the following
 transformation for each index `i` in `[1, N)` before performing the slice:
@@ -1009,19 +1005,22 @@ the rank of `operand`.
 
 <b> `DynamicUpdateSlice(operand, update, start_indices)` </b>
 
-| Arguments       | Type    | Semantics                                        |
-| --------------- | ------- | ------------------------------------------------ |
-| `operand`       | `XlaOp` | N dimensional array of type T                    |
-| `update`        | `XlaOp` | N dimensional array of type T containing the     |
-:                 :         : slice update. Each dimension of update shape     :
-:                 :         : must be strictly greater than zero, and start +  :
-:                 :         : update must be less than or equal to the operand :
-:                 :         : size for each dimension to avoid generating      :
-:                 :         : out-of-bounds update indices.                    :
-| `start_indices` | `XlaOp` | Rank 1 array of N integers containing the        |
-:                 :         : starting indices of the slice for each           :
-:                 :         : dimension. Value must be greater than or equal   :
-:                 :         : to zero.                                         :
+| Arguments       | Type                  | Semantics                          |
+| --------------- | --------------------- | ---------------------------------- |
+| `operand`       | `XlaOp`               | N dimensional array of type T      |
+| `update`        | `XlaOp`               | N dimensional array of type T      |
+:                 :                       : containing the slice update. Each  :
+:                 :                       : dimension of update shape must be  :
+:                 :                       : strictly greater than zero, and    :
+:                 :                       : start + update must be less than   :
+:                 :                       : or equal to the operand size for   :
+:                 :                       : each dimension to avoid generating :
+:                 :                       : out-of-bounds update indices.      :
+| `start_indices` | sequence of N `XlaOp` | List of N scalar integers          |
+:                 :                       : containing the starting indices of :
+:                 :                       : the slice for each dimension.      :
+:                 :                       : Value must be greater than or      :
+:                 :                       : equal to zero.                     :
 
 The effective slice indices are computed by applying the following
 transformation for each index `i` in `[1, N)` before performing the slice:
@@ -1095,7 +1094,7 @@ When `Op` is `Rem`, the sign of the result is taken from the dividend, and the
 absolute value of the result is always less than the divisor's absolute value.
 
 Integer division overflow (signed/unsigned division/remainder by zero or signed
-divison/remainder of `INT_SMIN` with `-1`) produces an implementation defined
+division/remainder of `INT_SMIN` with `-1`) produces an implementation defined
 value.
 
 An alternative variant with different-rank broadcasting support exists for these
diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc
index 42649d669222e6a871c81a01c54148c93f12a140..2fe9b56c6bdffb931726f60ab75081361b43ebb4 100644
--- a/tensorflow/compiler/xla/layout_util.cc
+++ b/tensorflow/compiler/xla/layout_util.cc
@@ -290,8 +290,8 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 /* static */ bool LayoutUtil::HasLayout(const Shape& shape) {
   if (shape.IsTuple()) {
     // Tuple shape: all subshapes must have a layout.
-    return std::all_of(shape.tuple_shapes().begin(), shape.tuple_shapes().end(),
-                       [](const Shape& s) { return HasLayout(s); });
+    return absl::c_all_of(shape.tuple_shapes(),
+                          [](const Shape& s) { return HasLayout(s); });
   } else if (!shape.IsArray()) {
     // Opaque, token types etc. ignore layout.
     return true;
@@ -424,7 +424,7 @@ Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src, Shape* dst) {
     positions_in_layout.push_back(
         PositionInContainer(layout.minor_to_major(), dim));
   }
-  std::sort(positions_in_layout.begin(), positions_in_layout.end());
+  absl::c_sort(positions_in_layout);
   for (size_t i = 1; i < positions_in_layout.size(); ++i) {
     if (1 != positions_in_layout[i] - positions_in_layout[i - 1]) {
       return false;
diff --git a/tensorflow/compiler/xla/literal_comparison.cc b/tensorflow/compiler/xla/literal_comparison.cc
index f4376d66af8b1eb069f5f711e4a619df9b0b7de2..258bc966b1a2ee3faa75e3319185489a107b3461 100644
--- a/tensorflow/compiler/xla/literal_comparison.cc
+++ b/tensorflow/compiler/xla/literal_comparison.cc
@@ -387,7 +387,14 @@ class NearComparator {
       rel_error = std::numeric_limits<float>::infinity();
     } else {
       abs_error = FpAbsoluteValue(actual - expected);
-      rel_error = abs_error / FpAbsoluteValue(expected);
+
+      // Avoid division by 0 even though it's well-defined because ubsan can be
+      // configured to treat this as a fatal error.
+      if (expected != T{0}) {
+        rel_error = abs_error / FpAbsoluteValue(expected);
+      } else {
+        rel_error = std::numeric_limits<float>::infinity();
+      }
     }
     const bool is_abs_mismatch = abs_error > error_.abs;
     const bool is_rel_mismatch = rel_error > error_.rel;
diff --git a/tensorflow/compiler/xla/metric_table_report.cc b/tensorflow/compiler/xla/metric_table_report.cc
index 4eab4fa4290c270697c00be20840cf4e85459183..ad1699a1ae65180d56617b069d8b2e1d7d81c38c 100644
--- a/tensorflow/compiler/xla/metric_table_report.cc
+++ b/tensorflow/compiler/xla/metric_table_report.cc
@@ -55,7 +55,7 @@ string MetricTableReport::MakeReport(double expected_metric_sum) {
   const auto metric_greater = [](const Entry& a, const Entry& b) {
     return a.metric > b.metric;
   };
-  std::sort(entries_.begin(), entries_.end(), metric_greater);
+  absl::c_sort(entries_, metric_greater);
 
   // Create the report
   AppendLine();
@@ -117,7 +117,7 @@ std::vector<MetricTableReport::Category> MetricTableReport::MakeCategories(
   auto metric_sum_greater = [](const Category& a, const Category& b) {
     return a.metric_sum > b.metric_sum;
   };
-  std::sort(categories.begin(), categories.end(), metric_sum_greater);
+  absl::c_sort(categories, metric_sum_greater);
 
   return categories;
 }
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index b0f0e9e57060f43e464ea2b2e20b9000679d18b6..c153603105a48c473572c51013f493a45c20c4b1 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -927,6 +927,22 @@ LocalOp LocalComputationBuilder::TriangularSolve(const LocalOp& a,
                               conjugate_a);
 }
 
+LocalOp LocalComputationBuilder::Gather(
+    const LocalOp& input, const LocalOp& start_indices,
+    const GatherDimensionNumbers& dimension_numbers,
+    absl::Span<const int64> slice_sizes) {
+  return xla::Gather(input.op(), start_indices.op(), dimension_numbers,
+                     slice_sizes);
+}
+
+LocalOp LocalComputationBuilder::Scatter(
+    const LocalOp& input, const LocalOp& scatter_indices,
+    const LocalOp& updates, const LocalComputation& update_computation,
+    const ScatterDimensionNumbers& dimension_numbers) {
+  return xla::Scatter(input.op(), scatter_indices.op(), updates.op(),
+                      update_computation.computation(), dimension_numbers);
+}
+
 StatusOr<LocalComputation*> LocalComputationBuilder::BuildConstantSubGraph(
     const LocalOp& operand) {
   TF_ASSIGN_OR_RETURN(XlaComputation computation,
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index 5e8341592100bc1eba4d1c17b0c2dd0e0888fdb1..98759cf984751d2cef8df4449d392ace786a8ebc 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -418,6 +418,15 @@ class LocalComputationBuilder {
   LocalOp TriangularSolve(const LocalOp& a, const LocalOp& b, bool left_side,
                           bool lower, bool transpose_a, bool conjugate_a);
 
+  LocalOp Gather(const LocalOp& input, const LocalOp& start_indices,
+                 const GatherDimensionNumbers& dimension_numbers,
+                 absl::Span<const int64> slice_sizes);
+
+  LocalOp Scatter(const LocalOp& input, const LocalOp& scatter_indices,
+                  const LocalOp& updates,
+                  const LocalComputation& update_computation,
+                  const ScatterDimensionNumbers& dimension_numbers);
+
   StatusOr<LocalComputation*> BuildConstantSubGraph(const LocalOp& operand);
 
 #define _FORWARD(method_name, return_sig, args_sig) \
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index bf5d667c6a12972845735983a74264ea05675971..66ecee5c4d44663838055065d9884a87a512b30b 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -34,6 +34,8 @@ limitations under the License.
 //  PaddingConfig proto                <-  corresponding Python proto
 //  ConvolutionDimensionNumbers proto  <-  corresponding Python proto
 //  DotDimensionNumbers proto          <-  corresponding Python proto
+//  GatherDimensionNumbers proto       <-  corresponding Python proto
+//  ScatterDimensionNumbers proto      <-  corresponding Python proto
 //
 // Arrows indicate whether a conversion only ever occurs in one
 // direction, or whether it is maintained bidirectionally.
@@ -167,8 +169,41 @@ bool HandleStringAttribute(PyObject* o,
   return true;  // Handled string attribute, ok!
 }
 
+bool HandleRepeatedInt64Attribute(
+    PyObject* o, const char* attr_name,
+    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>* field) {
+  PyObject* seq = PyObject_GetAttrString(o, attr_name);
+  if (!seq) {
+    return false;
+  }
+
+  int length = PySequence_Size(seq);
+  if (length == -1) {
+    Py_DECREF(seq);
+    return false;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(seq, i);
+    if (!item) {
+      Py_DECREF(seq);
+      return false;
+    }
+    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
+    if (dimension == -1 && PyErr_Occurred()) {
+      Py_DECREF(item);
+      Py_DECREF(seq);
+      return false;
+    }
+    *field->Add() = dimension;
+    Py_DECREF(item);
+  }
+  Py_DECREF(seq);
+  return true;
 }
-}
+
+}  // namespace swig
+}  // namespace xla
 %}
 
 // Required to use PyArray_* functions.
@@ -657,128 +692,27 @@ tensorflow::ImportNumpy();
 
 %typemap(in) const DotDimensionNumbers&
     (DotDimensionNumbers dimension_numbers) {
-  int length;
-
-  /* lhs_contracting_dimensions */
-  PyObject* lhs_contracting_dimensions = PyObject_GetAttrString(
-      $input, "lhs_contracting_dimensions");
-  if (!lhs_contracting_dimensions) {
-    SWIG_fail;
-  }
-
-  length = PySequence_Size(lhs_contracting_dimensions);
-  if (length == -1) {
-    Py_DECREF(lhs_contracting_dimensions);
-    SWIG_fail;
-  }
-
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(lhs_contracting_dimensions, i);
-    if (!item) {
-      Py_DECREF(lhs_contracting_dimensions);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(lhs_contracting_dimensions);
-      SWIG_fail;
-    }
-    dimension_numbers.add_lhs_contracting_dimensions(dimension);
-    Py_DECREF(item);
-  }
-  Py_DECREF(lhs_contracting_dimensions);
-
-  /* rhs_contracting_dimensions */
-  PyObject* rhs_contracting_dimensions = PyObject_GetAttrString(
-      $input, "rhs_contracting_dimensions");
-  if (!lhs_contracting_dimensions) {
+  if (!HandleRepeatedInt64Attribute(
+        $input, "lhs_contracting_dimensions",
+        dimension_numbers.mutable_lhs_contracting_dimensions())) {
     SWIG_fail;
   }
-
-  length = PySequence_Size(rhs_contracting_dimensions);
-  if (length == -1) {
-    Py_DECREF(rhs_contracting_dimensions);
+  if (!HandleRepeatedInt64Attribute(
+        $input, "rhs_contracting_dimensions",
+        dimension_numbers.mutable_rhs_contracting_dimensions())) {
     SWIG_fail;
   }
-
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(rhs_contracting_dimensions, i);
-    if (!item) {
-      Py_DECREF(rhs_contracting_dimensions);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(rhs_contracting_dimensions);
-      SWIG_fail;
-    }
-    dimension_numbers.add_rhs_contracting_dimensions(dimension);
-    Py_DECREF(item);
-  }
-  Py_DECREF(rhs_contracting_dimensions);
-
-  /* lhs_batch_dimensions */
-  PyObject* lhs_batch_dimensions = PyObject_GetAttrString(
-      $input, "lhs_batch_dimensions");
-  if (!lhs_batch_dimensions) {
+  if (!HandleRepeatedInt64Attribute(
+        $input, "lhs_batch_dimensions",
+        dimension_numbers.mutable_lhs_batch_dimensions())) {
     SWIG_fail;
   }
-
-  length = PySequence_Size(lhs_batch_dimensions);
-  if (length == -1) {
-    Py_DECREF(lhs_batch_dimensions);
-    SWIG_fail;
-  }
-
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(lhs_batch_dimensions, i);
-    if (!item) {
-      Py_DECREF(lhs_batch_dimensions);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(lhs_batch_dimensions);
-      SWIG_fail;
-    }
-    dimension_numbers.add_lhs_batch_dimensions(dimension);
-    Py_DECREF(item);
-  }
-  Py_DECREF(lhs_batch_dimensions);
-
-  /* rhs_batch_dimensions */
-  PyObject* rhs_batch_dimensions = PyObject_GetAttrString(
-      $input, "rhs_batch_dimensions");
-  if (!rhs_batch_dimensions) {
-    SWIG_fail;
-  }
-
-  length = PySequence_Size(rhs_batch_dimensions);
-  if (length == -1) {
-    Py_DECREF(rhs_batch_dimensions);
+  if (!HandleRepeatedInt64Attribute(
+        $input, "rhs_batch_dimensions",
+        dimension_numbers.mutable_rhs_batch_dimensions())) {
     SWIG_fail;
   }
 
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(rhs_batch_dimensions, i);
-    if (!item) {
-      Py_DECREF(rhs_batch_dimensions);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(rhs_batch_dimensions);
-      SWIG_fail;
-    }
-    dimension_numbers.add_rhs_batch_dimensions(dimension);
-    Py_DECREF(item);
-  }
-  Py_DECREF(rhs_batch_dimensions);
-
   $1 = &dimension_numbers;
 }
 
@@ -861,85 +795,80 @@ tensorflow::ImportNumpy();
   dimension_numbers.set_kernel_input_feature_dimension(value);
 
   PyObject* o;
-  int length;
 
-  o = PyObject_GetAttrString($input, "input_spatial_dimensions");
-  if (!o) {
+  if (!HandleRepeatedInt64Attribute(
+        $input, "input_spatial_dimensions",
+        dimension_numbers.mutable_input_spatial_dimensions())) {
     SWIG_fail;
   }
-  length = PySequence_Size(o);
-  if (length == -1) {
-    Py_DECREF(o);
+  if (!HandleRepeatedInt64Attribute(
+        $input, "kernel_spatial_dimensions",
+        dimension_numbers.mutable_kernel_spatial_dimensions())) {
     SWIG_fail;
   }
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(o, i);
-    if (!item) {
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    dimension_numbers.add_input_spatial_dimensions(dimension);
-    Py_DECREF(item);
+  if (!HandleRepeatedInt64Attribute(
+        $input, "output_spatial_dimensions",
+        dimension_numbers.mutable_output_spatial_dimensions())) {
+    SWIG_fail;
   }
-  Py_DECREF(o);
 
-  o = PyObject_GetAttrString($input, "kernel_spatial_dimensions");
-  if (!o) {
+  $1 = &dimension_numbers;
+}
+
+// GatherDimensionNumbers
+
+%typemap(in) const GatherDimensionNumbers&
+    (GatherDimensionNumbers dimension_numbers) {
+  if (!HandleRepeatedInt64Attribute(
+        $input, "offset_dims",
+        dimension_numbers.mutable_offset_dims())) {
     SWIG_fail;
   }
-  length = PySequence_Size(o);
-  if (length == -1) {
-    Py_DECREF(o);
+  if (!HandleRepeatedInt64Attribute(
+        $input, "collapsed_slice_dims",
+        dimension_numbers.mutable_collapsed_slice_dims())) {
     SWIG_fail;
   }
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(o, i);
-    if (!item) {
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    dimension_numbers.add_kernel_spatial_dimensions(dimension);
-    Py_DECREF(item);
+  if (!HandleRepeatedInt64Attribute(
+        $input, "start_index_map",
+        dimension_numbers.mutable_start_index_map())) {
+    SWIG_fail;
   }
-  Py_DECREF(o);
 
-  o = PyObject_GetAttrString($input, "output_spatial_dimensions");
-  if (!o) {
+  int64 value;
+  if (!GetIntAttr($input, "index_vector_dim", &value)) {
     SWIG_fail;
   }
-  length = PySequence_Size(o);
-  if (length == -1) {
-    Py_DECREF(o);
+  dimension_numbers.set_index_vector_dim(value);
+
+  $1 = &dimension_numbers;
+}
+
+// ScatterDimensionNumbers
+
+%typemap(in) const ScatterDimensionNumbers&
+    (ScatterDimensionNumbers dimension_numbers) {
+  if (!HandleRepeatedInt64Attribute(
+        $input, "update_window_dims",
+        dimension_numbers.mutable_update_window_dims())) {
     SWIG_fail;
   }
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(o, i);
-    if (!item) {
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    dimension_numbers.add_output_spatial_dimensions(dimension);
-    Py_DECREF(item);
+  if (!HandleRepeatedInt64Attribute(
+        $input, "inserted_window_dims",
+        dimension_numbers.mutable_inserted_window_dims())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "scatter_dims_to_operand_dims",
+        dimension_numbers.mutable_scatter_dims_to_operand_dims())) {
+    SWIG_fail;
+  }
+
+  int64 value;
+  if (!GetIntAttr($input, "index_vector_dim", &value)) {
+    SWIG_fail;
   }
-  Py_DECREF(o);
+  dimension_numbers.set_index_vector_dim(value);
 
   $1 = &dimension_numbers;
 }
@@ -1151,6 +1080,8 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::QR;
 %unignore xla::swig::LocalComputationBuilder::TriangularSolve;
 %unignore xla::swig::LocalComputationBuilder::CustomCall;
+%unignore xla::swig::LocalComputationBuilder::Gather;
+%unignore xla::swig::LocalComputationBuilder::Scatter;
 %unignore xla::swig::DeleteLocalComputation;
 %unignore xla::swig::DestructureLocalShapedBufferTuple;
 %unignore xla::swig::DestructureXrtAllocationTuple;
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index 378bbdcb175f10d73da87f5286cf5129477a124c..4e71121c097e1eca2d7fbd3299b17e06dd8b8e39 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -1477,6 +1477,18 @@ class ComputationBuilder(object):
     return self._client.TriangularSolve(
         a, b, left_side, lower, transpose_a, conjugate_a)
 
+  def Gather(self, a, start_indices, dimension_numbers, slice_sizes):
+    """Enqueues a Gather operation onto the computation."""
+    return self._client.Gather(a, start_indices, dimension_numbers,
+                               slice_sizes)
+
+  def Scatter(self, a, scatter_indices, updates, update_computation,
+              dimension_numbers):
+    """Enqueues a Scatter operation onto the computation."""
+    return self._client.Scatter(
+        a, scatter_indices, updates, update_computation.computation,
+        dimension_numbers,)
+
 
 def _forward_methods_to_local_builder():
   """Forward remaining ComputationBuilder methods to the C API.
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index 002a20e60a9fbe117af991731a555e60eef9397a..874e087eb6d4b785066edae21b1d11ebb024cd3e 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -1129,6 +1129,21 @@ class SingleOpTest(LocalComputationTest):
     self.assertFalse(c.IsConstant(non_const_expr))
     # self.assertTrue(c.IsConstant(c.Sub(c.Add(x, a), x)))  # TODO(b/77245564)
 
+  def testGather(self):
+    a = np.arange(9).astype(np.int32).reshape((3, 3))
+    indices = np.array([[[0, 2], [2, 1]], [[1, 2], [2, 0]]], dtype=np.int32)
+    dnums = xla_client.xla_data_pb2.GatherDimensionNumbers()
+    dnums.offset_dims.append(1)
+    dnums.offset_dims.append(2)
+    dnums.start_index_map.append(0)
+    dnums.start_index_map.append(1)
+    dnums.index_vector_dim = 2
+    c = self._NewComputation()
+    c.Gather(c.Constant(a), c.Constant(indices), dnums, slice_sizes=[1, 1])
+    g = self._Execute(c, ())
+    expected = np.array([[[[2, 7]]], [[[5, 6]]]], dtype=np.int32)
+    np.testing.assert_allclose(g, expected, rtol=1e-4)
+
 
 class EmbeddedComputationsTest(LocalComputationTest):
   """Tests for XLA graphs with embedded computations (such as maps)."""
@@ -1186,6 +1201,14 @@ class EmbeddedComputationsTest(LocalComputationTest):
     c.Mul(c.ParameterFromNumpy(NumpyArrayF64(0)), c.ConstantF64Scalar(2.0))
     return c.Build()
 
+  def _CreateBinaryAddS32Computation(self):
+    """Computation (s32, s32) -> s32 that adds its two parameters."""
+    c = self._NewComputation("add_param0_by_param1")
+    c.Add(
+        c.ParameterFromNumpy(NumpyArrayS32(0)),
+        c.ParameterFromNumpy(NumpyArrayS32(0)))
+    return c.Build()
+
   def _CreateBinaryAddF32Computation(self):
     """Computation (f32, f32) -> f32 that adds its two parameters."""
     c = self._NewComputation("add_param0_by_param1")
@@ -1568,6 +1591,23 @@ class EmbeddedComputationsTest(LocalComputationTest):
       execution.join()
       self.assertEqual(want, got)
 
+  def testScatter(self):
+    a = np.arange(9).astype(np.int32).reshape((3, 3))
+    scatter_indices = np.array([0, 2], dtype=np.int32)
+    updates = np.array([[10, 20, 30], [70, 80, 90]], dtype=np.int32)
+
+    dnums = xla_client.xla_data_pb2.ScatterDimensionNumbers()
+    dnums.update_window_dims.append(1)
+    dnums.inserted_window_dims.append(0)
+    dnums.scatter_dims_to_operand_dims.append(0)
+    dnums.index_vector_dim = 1
+
+    c = self._NewComputation()
+    c.Scatter(c.Constant(a), c.Constant(scatter_indices), c.Constant(updates),
+              self._CreateBinaryAddS32Computation(), dnums)
+    expected = np.array([[10, 21, 32], [3, 4, 5], [76, 87, 98]], dtype=np.int32)
+    self._ExecuteAndCompareClose(c, expected=expected)
+
 
 class ErrorTest(LocalComputationTest):
 
diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc
index 3ba67f69e8cd1f230095d047ec9075f072e26fd9..08b78ee244844f41d551d7e249cec0cbf157d639 100644
--- a/tensorflow/compiler/xla/reference_util.cc
+++ b/tensorflow/compiler/xla/reference_util.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <array>
 #include <utility>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
@@ -550,7 +551,7 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated(
 
   HloEvaluator evaluator;
   Literal result_literal =
-      evaluator.Evaluate<const Literal*>(*computation, {}).ConsumeValueOrDie();
+      evaluator.Evaluate(*computation, {}).ConsumeValueOrDie();
 
   CHECK_EQ(result_literal.shape().rank(), 4);
   auto result =
@@ -605,24 +606,26 @@ ReferenceUtil::ReduceToRowArray2D(
     const std::function<float(float, float)>& reduce_function) {
   std::vector<float> result;
   CHECK_EQ(dims.size(), 3);
-  const std::set<int64> dim_set(dims.begin(), dims.end());
+  const absl::flat_hash_set<int64> dim_set(dims.begin(), dims.end());
   CHECK_EQ(dim_set.size(), 3);
-  for (int64 a0 = 0; a0 == 0 || (!dim_set.count(0) && a0 < array.n1()); ++a0) {
-    for (int64 a1 = 0; a1 == 0 || (!dim_set.count(1) && a1 < array.n2());
+  for (int64 a0 = 0; a0 == 0 || (!dim_set.contains(0) && a0 < array.n1());
+       ++a0) {
+    for (int64 a1 = 0; a1 == 0 || (!dim_set.contains(1) && a1 < array.n2());
          ++a1) {
-      for (int64 a2 = 0; a2 == 0 || (!dim_set.count(2) && a2 < array.n3());
+      for (int64 a2 = 0; a2 == 0 || (!dim_set.contains(2) && a2 < array.n3());
            ++a2) {
-        for (int64 a3 = 0; a3 == 0 || (!dim_set.count(3) && a3 < array.n4());
+        for (int64 a3 = 0; a3 == 0 || (!dim_set.contains(3) && a3 < array.n4());
              ++a3) {
           float accumulator = init;
-          for (int64 i0 = 0; i0 == 0 || (dim_set.count(0) && i0 < array.n1());
-               ++i0) {
-            for (int64 i1 = 0; i1 == 0 || (dim_set.count(1) && i1 < array.n2());
-                 ++i1) {
+          for (int64 i0 = 0;
+               i0 == 0 || (dim_set.contains(0) && i0 < array.n1()); ++i0) {
+            for (int64 i1 = 0;
+                 i1 == 0 || (dim_set.contains(1) && i1 < array.n2()); ++i1) {
               for (int64 i2 = 0;
-                   i2 == 0 || (dim_set.count(2) && i2 < array.n3()); ++i2) {
+                   i2 == 0 || (dim_set.contains(2) && i2 < array.n3()); ++i2) {
                 for (int64 i3 = 0;
-                     i3 == 0 || (dim_set.count(3) && i3 < array.n4()); ++i3) {
+                     i3 == 0 || (dim_set.contains(3) && i3 < array.n4());
+                     ++i3) {
                   // Handle zero-sized arrays.
                   if (array.n1() > 0 && array.n2() > 0 && array.n3() > 0 &&
                       array.n4() > 0) {
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index d8736c819687482a9dead57bdeacff8e75dce105..728adb67a0d64c5dcb7a6e733b5272f6db911419 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1,6 +1,14 @@
 # Description:
 #   XLA service implementation.
 
+load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
+load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_proto_library_py",
+)
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
+
 licenses(["notice"])  # Apache 2.0
 
 package(default_visibility = [":friends"])
@@ -12,15 +20,6 @@ package_group(
     ],
 )
 
-load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
-load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
-load(
-    "//tensorflow/core:platform/default/build_config.bzl",
-    "tf_proto_library_py",
-)
-
 xla_proto_library(
     name = "hlo_proto",
     srcs = ["hlo.proto"],
@@ -237,6 +236,7 @@ cc_library(
     ],
     hdrs = ["hlo_evaluator.h"],
     deps = [
+        ":dynamic_dimension_inference",
         ":hlo",
         ":hlo_casting_utils",
         ":hlo_query",
@@ -516,6 +516,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -678,6 +679,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//third_party/eigen3",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
@@ -696,6 +698,7 @@ cc_library(
         ":compiler",
         ":computation_layout",
         ":device_memory_allocator",
+        ":dynamic_dimension_inference",
         ":executable",
         ":execution_tracker",
         ":hlo",
@@ -1003,6 +1006,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
@@ -1137,6 +1141,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -1580,6 +1585,8 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -1866,8 +1873,9 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1931,6 +1939,46 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "dynamic_padder",
+    srcs = ["dynamic_padder.cc"],
+    hdrs = ["dynamic_padder.h"],
+    deps = [
+        ":dynamic_dimension_inference",
+        ":hlo_dce",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+    ],
+)
+
+tf_cc_test(
+    name = "dynamic_padder_test",
+    srcs = ["dynamic_padder_test.cc"],
+    deps = [
+        ":dynamic_padder",
+        "//tensorflow/compiler/xla:debug_options_flags",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_runner",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test",
+    ],
+)
+
 tf_cc_test(
     name = "dynamic_dimension_inference_test",
     srcs = ["dynamic_dimension_inference_test.cc"],
@@ -2116,6 +2164,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -2288,6 +2337,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
@@ -2548,6 +2598,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -2592,6 +2643,7 @@ tf_cc_test(
     srcs = ["hlo_verifier_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_module_config",
         ":hlo_parser",
         ":hlo_verifier",
         ":layout_assignment",
@@ -2599,6 +2651,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -2969,6 +3022,7 @@ cc_library(
     srcs = ["hlo_get_dimension_size_rewriter.cc"],
     hdrs = ["hlo_get_dimension_size_rewriter.h"],
     deps = [
+        ":dynamic_dimension_inference",
         ":hlo",
         ":hlo_pass",
         ":shape_inference",
@@ -3186,6 +3240,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:regexp_internal",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:optional",
@@ -3403,6 +3458,7 @@ cc_library(
         ":hlo_profile_printer_data",
         ":human_readable_profile_builder",
         "//tensorflow/compiler/xla:types",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -3574,6 +3630,7 @@ cc_library(
 tf_cc_test(
     name = "indexed_array_analysis_test",
     srcs = ["indexed_array_analysis_test.cc"],
+    extra_copts = ["-Wno-string-plus-int"],
     deps = [
         ":hlo_matchers",
         ":indexed_array_analysis",
@@ -3675,6 +3732,7 @@ cc_library(
         ":pattern_matcher",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
@@ -3686,6 +3744,38 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "dynamic_index_splitter",
+    srcs = ["dynamic_index_splitter.cc"],
+    hdrs = ["dynamic_index_splitter.h"],
+    deps = [
+        ":hlo_casting_utils",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cc_test(
+    name = "dynamic_index_splitter_test",
+    srcs = ["dynamic_index_splitter_test.cc"],
+    deps = [
+        ":dynamic_index_splitter",
+        ":hlo",
+        ":hlo_matchers",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+    ],
+)
+
 tf_cc_test(
     name = "ar_crs_combiner_test",
     srcs = ["ar_crs_combiner_test.cc"],
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index cad70a8d10f4e7bed22b11039309d5c7d02e650f..5ac746c9f3fc0503c143da2f3da8b312ae0a4280 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -26,6 +26,8 @@ limitations under the License.
 #include <vector>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
@@ -369,6 +371,11 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   // Tries to convert slice(reshape(X)) into reshape(slice(X))
   StatusOr<bool> TryToReorderSliceAndReshape(HloInstruction* slice);
 
+  // If the sort instruction has a tuple shape then looks for unused output
+  // values and removes them from the sort instruction. Returns true if the
+  // graph have been modified.
+  StatusOr<bool> RemoveUnusedOperandFromSort(HloInstruction* sort);
+
   // Current HloComputation instance the AlgebraicSimplifierVisitor is
   // traversing.
   HloComputation* computation_;
@@ -1373,6 +1380,9 @@ StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfGather(
   // => output dimensions: DS ({M x N}, {0, start}, {M, 1}) => {M x 1}.
 
   bool lhs_is_dynamic_slice = lhs->opcode() == HloOpcode::kDynamicSlice;
+  HloDynamicSliceInstruction* dynamic_slice =
+      lhs_is_dynamic_slice ? Cast<HloDynamicSliceInstruction>(lhs)
+                           : Cast<HloDynamicSliceInstruction>(rhs);
 
   // ctA:
   HloInstruction* left_operand =
@@ -1390,8 +1400,6 @@ StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfGather(
       HloInstruction::CreateDot(memoized_shape, left_operand, right_operand,
                                 dnums, dot->precision_config()));
   // Get pair {start, 0} or {0, start}.
-  HloInstruction* original_start_indices =
-      lhs_is_dynamic_slice ? lhs->mutable_operand(1) : rhs->mutable_operand(1);
   // Position of start:
   int index_of_non_zero_start = lhs_is_dynamic_slice
                                     ? 1 - lhs_contracting_dimension
@@ -1400,23 +1408,19 @@ StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfGather(
   int index_of_zero_start = 1 - index_of_non_zero_start;
 
   // Slice out start and 0 components and reorder if necessary.
-  auto indices_type = original_start_indices->shape().element_type();
+  auto indices_type = dynamic_slice->operand(1)->shape().element_type();
   Shape s_shape = ShapeUtil::MakeShape(indices_type, {1});
   Shape d_shape = ShapeUtil::MakeShape(indices_type, {2});
   HloInstruction* non_zero_start =
-      computation_->AddInstruction(HloInstruction::CreateSlice(
-          s_shape, original_start_indices, {index_of_non_zero_start},
-          {index_of_non_zero_start + 1}, {1}));
+      dynamic_slice->mutable_operand(1 + index_of_non_zero_start);
   HloInstruction* zero_start =
-      computation_->AddInstruction(HloInstruction::CreateSlice(
-          s_shape, original_start_indices, {index_of_zero_start},
-          {index_of_zero_start + 1}, {1}));
-  HloInstruction* new_start_indices =
-      lhs_is_dynamic_slice
-          ? computation_->AddInstruction(HloInstruction::CreateConcatenate(
-                d_shape, {non_zero_start, zero_start}, 0))
-          : computation_->AddInstruction(HloInstruction::CreateConcatenate(
-                d_shape, {zero_start, non_zero_start}, 0));
+      dynamic_slice->mutable_operand(1 + index_of_zero_start);
+  std::vector<HloInstruction*> new_start_indices;
+  if (lhs_is_dynamic_slice) {
+    new_start_indices = {non_zero_start, zero_start};
+  } else {
+    new_start_indices = {zero_start, non_zero_start};
+  }
 
   // Build DynamicSlice(ctA x ctB).
   const int new_slice_m = lhs_is_dynamic_slice ? 1 : m;
@@ -2209,8 +2213,7 @@ Status AlgebraicSimplifierVisitor::HandleReverse(HloInstruction* reverse) {
   auto dim_is_one = [&](int64 i) -> bool {
     return reverse->shape().dimensions(i) == 1;
   };
-  if (std::all_of(reverse->dimensions().begin(), reverse->dimensions().end(),
-                  dim_is_one)) {
+  if (absl::c_all_of(reverse->dimensions(), dim_is_one)) {
     return ReplaceInstruction(reverse, reverse->mutable_operand(0));
   }
   return Status::OK();
@@ -2485,9 +2488,9 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
     // Create a new reduce with the combined reduction dimensions of both
     // reduces.
     std::vector<int64> arg_dims = arg->dimensions();
-    std::sort(arg_dims.begin(), arg_dims.end());
+    absl::c_sort(arg_dims);
     std::vector<int64> reduce_dims = reduce->dimensions();
-    std::sort(reduce_dims.begin(), reduce_dims.end());
+    absl::c_sort(reduce_dims);
     // Transform reduce_dims to the same rank as the operand of the operand.
     for (int64 arg_dim : arg_dims) {
       for (int64& dim : reduce_dims) {
@@ -2533,7 +2536,7 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
     }
     if (can_move_reshape_into_reduce) {
       changed_ = true;
-      std::unordered_set<int64> dimensions_not_to_reduce;
+      absl::flat_hash_set<int64> dimensions_not_to_reduce;
       for (auto dim_pair : unmodified_dims) {
         if (arg_dim_in_output[dim_pair.second]) {
           dimensions_not_to_reduce.insert(dim_pair.first);
@@ -2541,7 +2544,7 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
       }
       std::vector<int64> new_reduce_dimensions;
       for (int64 i = 0; i < arg->operand(0)->shape().rank(); ++i) {
-        if (dimensions_not_to_reduce.count(i) == 0) {
+        if (!dimensions_not_to_reduce.contains(i)) {
           new_reduce_dimensions.push_back(i);
         }
       }
@@ -2595,51 +2598,53 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
                                   function));
   }
 
-  // A reduce window can be expressed as a reduce and a reshape if all
-  // dimensions either have a window size of one or the entire dimension. If
-  // there is no stride, dilation, or padding, this is as easy as checking the
-  // size of the output shape and window dimension.
-  //
-  // The reshape is a bitcast since it adds one-sized dimensions. Often these
-  // ones are immediately removed as well with another reshape. The
-  // implementation of reduce tends to be slightly more efficient at reducing
-  // entire dimensions compared to reduce window.
-  auto effective_reduce_dims = [&] {
-    if (window_util::HasStride(window) || window_util::HasDilation(window) ||
-        window_util::HasPadding(window)) {
-      return absl::InlinedVector<int64, 8>{};
-    }
-    absl::InlinedVector<int64, 8> reduce_dims;
-    for (int64 i = 0; i < window.dimensions_size(); ++i) {
-      if (window.dimensions(i).size() == 1) {
-        continue;
-      } else if (reduce_window->shape().dimensions(i) == 1) {
-        reduce_dims.push_back(i);
-      } else {
+  if (options_.enable_window_reduce_to_reduce_replacement()) {
+    // A reduce window can be expressed as a reduce and a reshape if all
+    // dimensions either have a window size of one or the entire dimension. If
+    // there is no stride, dilation, or padding, this is as easy as checking the
+    // size of the output shape and window dimension.
+    //
+    // The reshape is a bitcast since it adds one-sized dimensions. Often these
+    // ones are immediately removed as well with another reshape. The
+    // implementation of reduce tends to be slightly more efficient at reducing
+    // entire dimensions compared to reduce window.
+    auto effective_reduce_dims = [&] {
+      if (window_util::HasStride(window) || window_util::HasDilation(window) ||
+          window_util::HasPadding(window)) {
         return absl::InlinedVector<int64, 8>{};
       }
-    }
-    return reduce_dims;
-  }();
+      absl::InlinedVector<int64, 8> reduce_dims;
+      for (int64 i = 0; i < window.dimensions_size(); ++i) {
+        if (window.dimensions(i).size() == 1) {
+          continue;
+        } else if (reduce_window->shape().dimensions(i) == 1) {
+          reduce_dims.push_back(i);
+        } else {
+          return absl::InlinedVector<int64, 8>{};
+        }
+      }
+      return reduce_dims;
+    }();
 
-  // If a reduce window can be expressed as a reduce, do so and reshape the
-  // output.
-  if (!effective_reduce_dims.empty()) {
-    Shape reduce_shape = ShapeUtil::FilterDimensions(
-        [&](int64 dim) {
-          return !absl::c_linear_search(effective_reduce_dims, dim);
-        },
-        reduce_window->shape());
-    HloInstruction* reduce =
-        computation_->AddInstruction(HloInstruction::CreateReduce(
-            /*shape=*/reduce_shape,
-            /*operand=*/operand,
-            /*init_value=*/reduce_window->mutable_operand(1),
-            /*dimensions_to_reduce=*/effective_reduce_dims,
-            /*reduce_computation=*/function));
-    return ReplaceWithNewInstruction(
-        reduce_window,
-        HloInstruction::CreateReshape(reduce_window->shape(), reduce));
+    // If a reduce window can be expressed as a reduce, do so and reshape the
+    // output.
+    if (!effective_reduce_dims.empty()) {
+      Shape reduce_shape = ShapeUtil::FilterDimensions(
+          [&](int64 dim) {
+            return !absl::c_linear_search(effective_reduce_dims, dim);
+          },
+          reduce_window->shape());
+      HloInstruction* reduce =
+          computation_->AddInstruction(HloInstruction::CreateReduce(
+              /*shape=*/reduce_shape,
+              /*operand=*/operand,
+              /*init_value=*/reduce_window->mutable_operand(1),
+              /*dimensions_to_reduce=*/effective_reduce_dims,
+              /*reduce_computation=*/function));
+      return ReplaceWithNewInstruction(
+          reduce_window,
+          HloInstruction::CreateReshape(reduce_window->shape(), reduce));
+    }
   }
 
   // This optimization folds a pad op into reduce_window.
@@ -2814,6 +2819,69 @@ Status AlgebraicSimplifierVisitor::HandleSelect(HloInstruction* select) {
   return Status::OK();
 }
 
+StatusOr<bool> AlgebraicSimplifierVisitor::RemoveUnusedOperandFromSort(
+    HloInstruction* sort) {
+  if (!sort->shape().IsTuple()) {
+    return false;
+  }
+
+  if (sort->parent()->root_instruction() == sort) {
+    // Can't analyse users of the root instruction.
+    return false;
+  }
+
+  // Index 0 is the sorting key used by the sort HLO itself.
+  absl::flat_hash_set<int64> used_indices{0};
+  for (const HloInstruction* user : sort->users()) {
+    if (user->opcode() != HloOpcode::kGetTupleElement) {
+      // Can't analyse users other then get-tuple-element.
+      return false;
+    }
+    used_indices.insert(user->tuple_index());
+  }
+
+  if (used_indices.size() == sort->operand_count()) {
+    // All operands are used.
+    return false;
+  }
+
+  std::vector<HloInstruction*> operands{sort->mutable_operand(0)};
+  std::vector<Shape> new_shapes{sort->operand(0)->shape()};
+  for (int64 i = 1; i < sort->operand_count(); ++i) {
+    if (used_indices.count(i)) {
+      operands.push_back(sort->mutable_operand(i));
+      new_shapes.push_back(sort->operand(i)->shape());
+    }
+  }
+  Shape new_sort_shape = new_shapes.size() == 1
+                             ? new_shapes[0]
+                             : ShapeUtil::MakeTupleShape(new_shapes);
+  HloInstruction* new_sort = computation_->AddInstruction(
+      sort->CloneWithNewOperands(new_sort_shape, operands));
+
+  // Map from original get-tuple-element tuple index to new HLO instruction
+  absl::flat_hash_map<int64, HloInstruction*> result_map;
+  if (new_sort->shape().IsTuple()) {
+    // Old sort key maps to new sort key.
+    int64 new_index = 0;
+    for (int64 i = 0; i < sort->operand_count(); ++i) {
+      if (used_indices.count(i)) {
+        result_map[i] =
+            computation_->AddInstruction(HloInstruction::CreateGetTupleElement(
+                new_shapes[new_index], new_sort, new_index));
+        ++new_index;
+      }
+    }
+  } else {
+    result_map[0] = new_sort;
+  }
+  for (HloInstruction* user : sort->users()) {
+    TF_RETURN_IF_ERROR(
+        user->ReplaceAllUsesWith(result_map.at(user->tuple_index())));
+  }
+  return true;
+}
+
 Status AlgebraicSimplifierVisitor::HandleSort(HloInstruction* sort) {
   auto operand = sort->mutable_operand(0);
   int64 dimension_to_sort = sort->dimensions(0);
@@ -2826,6 +2894,14 @@ Status AlgebraicSimplifierVisitor::HandleSort(HloInstruction* sort) {
     return ReplaceWithNewInstruction(
         sort, HloInstruction::CreateTuple(sort->operands()));
   }
+
+  // Remove the unused values from a key-value sort.
+  TF_ASSIGN_OR_RETURN(bool removed_operand, RemoveUnusedOperandFromSort(sort));
+  if (removed_operand) {
+    changed_ = true;
+    return Status::OK();
+  }
+
   if (!options_.enable_permutation_sort_replacement()) {
     return Status::OK();
   }
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h
index d2775b9fafa7e4c625f5d181114e80e7369f9c78..1acaadaaa03803e467fd9bf6b22f5793fd6a1ec9 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.h
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h
@@ -75,12 +75,24 @@ class AlgebraicSimplifierOptions {
     return enable_permutation_sort_replacement_;
   }
 
+  // If enable_window_reduce_replacement is true, the kReduceWindow instruction
+  // can be optimized by replacement with simpler operations.
+  void set_enable_window_reduce_to_reduce_replacement(
+      bool enable_window_reduce_to_reduce_replacement) {
+    enable_window_reduce_to_reduce_replacement_ =
+        enable_window_reduce_to_reduce_replacement;
+  }
+  bool enable_window_reduce_to_reduce_replacement() const {
+    return enable_window_reduce_to_reduce_replacement_;
+  }
+
  private:
   ValidBitcastCallback valid_bitcast_callback_;
   bool is_layout_sensitive_{false};
   bool enable_dot_strength_reduction_{true};
   bool enable_conv_simplification_{true};
   bool enable_permutation_sort_replacement_{false};
+  bool enable_window_reduce_to_reduce_replacement_{true};
 };
 
 // A pass which performs algebraic simplifications.
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 51ad748ff829966d04e360d457f2ae081ec187eb..916b953cfc25693b1f75095e28b3167dc8c740e1 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -2709,6 +2709,74 @@ TEST_F(AlgebraicSimplifierTest, DontReplacePermutationSortWrongDimensions) {
   EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 }
 
+TEST_F(AlgebraicSimplifierTest, RemoveUnusedSortOperandArrayResult) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+    ENTRY sort_computation {
+      keys = f32[64,8732]{1,0} parameter(0)
+      values = s32[64,8732]{1,0} parameter(1)
+      sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+        dimensions={1}
+      ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Sort(m::Parameter(0))));
+}
+
+TEST_F(AlgebraicSimplifierTest, RemoveUnusedSortOperandTuple) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+    ENTRY sort_computation {
+      keys = f32[64,87] parameter(0)
+      values.0 = s32[64,87] parameter(1)
+      values.1 = u32[64,87] parameter(2)
+      sort = (f32[64,87], s32[64,87], u32[64,87]) sort(
+          keys, values.0, values.1),
+        dimensions={1}
+      gte.0 = f32[64,87] get-tuple-element(sort), index=0
+      gte.1 = u32[64,87] get-tuple-element(sort), index=2
+      ROOT tuple = (f32[64,87], u32[64,87]) tuple(gte.0, gte.1)
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(
+      root,
+      GmockMatch(m::Tuple(
+          m::GetTupleElement(m::Sort(m::Parameter(0), m::Parameter(2)), 0),
+          m::GetTupleElement(m::Sort(m::Parameter(0), m::Parameter(2)), 1))));
+}
+
+TEST_F(AlgebraicSimplifierTest, DontRemoveUnusedSortKey) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+    ENTRY sort_computation {
+      keys = f32[64,8732]{1,0} parameter(0)
+      values = s32[64,8732]{1,0} parameter(1)
+      sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1}
+      ROOT gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+}
+
 TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) {
   auto builder = HloComputation::Builder(TestName());
 
@@ -3706,12 +3774,16 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) {
   HloComputation::Builder builder(TestName());
 
   Shape shape = ShapeUtil::MakeShape(F32, {10, 100, 1000});
+  std::vector<HloInstruction*> params;
+  for (int i = 0; i < 3; ++i) {
+    params.push_back(builder.AddInstruction(HloInstruction::CreateParameter(
+        i + 1, ShapeUtil::MakeShape(U32, {}), "slice_indices")));
+  }
   builder.AddInstruction(HloInstruction::CreateDynamicSlice(
       shape,
       builder.AddInstruction(
           HloInstruction::CreateParameter(0, shape, "slice_from")),
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          1, ShapeUtil::MakeShape(U32, {3}), "slice_indices")),
+      params,
       /*slice_sizes=*/{10, 100, 1000}));
 
   auto computation = m->AddEntryComputation(builder.Build());
@@ -3730,28 +3802,35 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) {
   Shape full_shape = ShapeUtil::MakeShape(F32, {10, 100, 1000});
   Shape slice_shape = ShapeUtil::MakeShape(F32, {10, 1, 1000});
 
+  std::vector<HloInstruction*> slice_indices, update_indices;
+  for (int i = 0; i < 3; ++i) {
+    slice_indices.push_back(
+        builder.AddInstruction(HloInstruction::CreateParameter(
+            i + 1, ShapeUtil::MakeShape(U32, {}), "slice_indices")));
+    update_indices.push_back(
+        builder.AddInstruction(HloInstruction::CreateParameter(
+            i + 5, ShapeUtil::MakeShape(U32, {}), "update_indices")));
+  }
   HloInstruction* slice =
       builder.AddInstruction(HloInstruction::CreateDynamicSlice(
           slice_shape,
           builder.AddInstruction(
               HloInstruction::CreateParameter(0, full_shape, "slice_from")),
-          builder.AddInstruction(HloInstruction::CreateParameter(
-              1, ShapeUtil::MakeShape(U32, {3}), "slice_indices")),
+          slice_indices,
           /*slice_sizes=*/{10, 1, 1000}));
 
   builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
       slice_shape,
       builder.AddInstruction(
-          HloInstruction::CreateParameter(2, slice_shape, "to_update")),
-      slice,
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          3, ShapeUtil::MakeShape(U32, {3}), "update_indices"))));
+          HloInstruction::CreateParameter(4, slice_shape, "to_update")),
+      slice, update_indices));
 
   auto computation = m->AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
-              GmockMatch(m::DynamicSlice(m::Parameter(), m::Parameter())));
+              GmockMatch(m::DynamicSlice(m::Parameter(), m::Parameter(),
+                                         m::Parameter(), m::Parameter())));
 }
 
 // Test that two consecutive broadcasts can be merged to one.
@@ -4412,9 +4491,10 @@ TEST_F(AlgebraicSimplifierTest, DynamicUpdateSliceZeroUpdate) {
   HloInstruction* const update = builder.AddInstruction(
       HloInstruction::CreateParameter(1, update_shape, "update"));
   HloInstruction* const start_indices = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int>({0})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int>({})));
   builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-      dslice_shape, operand, update, start_indices));
+      dslice_shape, operand, update,
+      std::initializer_list<HloInstruction*>({start_indices})));
   const HloComputation* const computation =
       m->AddEntryComputation(builder.Build());
 
@@ -4467,14 +4547,17 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) {
 
   int32 start_row = (spec.lcd == 0) ? 0 : spec.s;
   int32 start_col = (spec.lcd == 0) ? spec.s : 0;
-  const auto start_indices =
+  std::vector<HloInstruction*> start_indices = {
+      builder.AddInstruction(HloInstruction::CreateConstant(
+          LiteralUtil::CreateR0<int32>(start_row))),
       builder.AddInstruction(HloInstruction::CreateConstant(
-          LiteralUtil::CreateR1<int32>({start_row, start_col})));
+          LiteralUtil::CreateR0<int32>(start_col)))};
   int64 slice_row_size = (spec.lcd == 0) ? spec.k : 1;
   int64 slice_col_size = (spec.lcd == 0) ? 1 : spec.k;
-  Shape ds_shape = ShapeUtil::MakeShape(F32, {slice_row_size, slice_col_size});
+  std::vector<int64> slice_sizes = {slice_row_size, slice_col_size};
+  Shape ds_shape = ShapeUtil::MakeShape(F32, slice_sizes);
   auto* ds = builder.AddInstruction(HloInstruction::CreateDynamicSlice(
-      ds_shape, lhs, start_indices, {slice_row_size, slice_col_size}));
+      ds_shape, lhs, start_indices, slice_sizes));
 
   int64 rhs_rows = (spec.rcd == 0) ? spec.k : spec.n;
   int64 rhs_cols = (spec.rcd == 0) ? spec.n : spec.k;
@@ -4507,7 +4590,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) {
   } else {
     EXPECT_THAT(computation->root_instruction(),
                 GmockMatch(m::DynamicSlice(m::Dot(m::Constant(), m::Constant()),
-                                           m::Concatenate())));
+                                           m::Constant(), m::Constant())));
   }
 }
 
@@ -4545,14 +4628,17 @@ TEST_P(DotOfGatherSimplificationTest, ConstantLHS) {
 
   int32 start_row = (spec.rcd == 0) ? 0 : spec.s;
   int32 start_col = (spec.rcd == 0) ? spec.s : 0;
-  const auto start_indices =
+  std::vector<HloInstruction*> start_indices = {
+      builder.AddInstruction(HloInstruction::CreateConstant(
+          LiteralUtil::CreateR0<int32>(start_row))),
       builder.AddInstruction(HloInstruction::CreateConstant(
-          LiteralUtil::CreateR1<int32>({start_row, start_col})));
+          LiteralUtil::CreateR0<int32>(start_col)))};
   int64 slice_row_size = (spec.rcd == 0) ? spec.k : 1;
   int64 slice_col_size = (spec.rcd == 0) ? 1 : spec.k;
-  Shape ds_shape = ShapeUtil::MakeShape(F32, {slice_row_size, slice_col_size});
+  std::vector<int64> slice_sizes = {slice_row_size, slice_col_size};
+  Shape ds_shape = ShapeUtil::MakeShape(F32, slice_sizes);
   auto* ds = builder.AddInstruction(HloInstruction::CreateDynamicSlice(
-      ds_shape, rhs, start_indices, {slice_row_size, slice_col_size}));
+      ds_shape, rhs, start_indices, slice_sizes));
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(spec.lcd);
@@ -4577,7 +4663,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantLHS) {
   } else {
     EXPECT_THAT(computation->root_instruction(),
                 GmockMatch(m::DynamicSlice(m::Dot(m::Constant(), m::Constant()),
-                                           m::Concatenate())));
+                                           m::Constant(), m::Constant())));
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.cc b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
index 47d2c7e35705698d49950c2fa042af1c6327d521..35a32f537b81fcd4bcbda4aaf8f56ea900559905 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner.cc
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -44,11 +45,24 @@ bool MatchesArCrsPattern(HloInstruction* instruction) {
     if (instruction->user_count() != 1) {
       return false;
     }
-    auto opcode = instruction->opcode();
-    return opcode == HloOpcode::kBitcast || opcode == HloOpcode::kTranspose ||
-           opcode == HloOpcode::kReshape || opcode == HloOpcode::kConvert ||
-           opcode == HloOpcode::kAdd || opcode == HloOpcode::kSubtract ||
-           opcode == HloOpcode::kMultiply;
+    switch (instruction->opcode()) {
+      case HloOpcode::kBitcast:
+      case HloOpcode::kTranspose:
+      case HloOpcode::kReshape:
+        return true;
+      case HloOpcode::kConvert:
+        // Can be moved across if both input and output is either float or
+        // integer (e.g. S32<->U32 or F32<->BF16)
+        return ShapeUtil::ElementIsFloating(instruction->shape()) ==
+               ShapeUtil::ElementIsFloating(instruction->operand(0)->shape());
+      case HloOpcode::kAdd:
+      case HloOpcode::kSubtract:
+      case HloOpcode::kMultiply:
+        // Only supported for floating point operands.
+        return ShapeUtil::ElementIsFloating(instruction->shape());
+      default:
+        return false;
+    }
   };
 
   auto computation_is_addition = [](HloComputation* c) {
@@ -176,6 +190,15 @@ bool ArCrsCombiner::InstructionsComputeSameValue(
   if (opcode1 != i2->opcode() || operands1.size() != i2->operands().size()) {
     return false;
   }
+  auto eq_computations = [](const HloComputation* a, const HloComputation* b) {
+    return *a == *b;
+  };
+  if (i1->IsCrossModuleAllReduce()) {
+    return i1->Identical(*i2,
+                         /*eq_operands=*/std::equal_to<const HloInstruction*>(),
+                         eq_computations,
+                         /*layout_sensitive=*/false);
+  }
   visited_pairs->emplace(min_uid, max_uid);
   for (int i = 0; i < operands1.size(); ++i) {
     auto operand1 = operands1[i];
@@ -201,9 +224,6 @@ bool ArCrsCombiner::InstructionsComputeSameValue(
   // InstructionsComputeSameValue earlier.
   auto eq_instructions = [](const HloInstruction* i1,
                             const HloInstruction* i2) -> bool { return true; };
-  auto eq_computations = [](const HloComputation* a, const HloComputation* b) {
-    return *a == *b;
-  };
   return i1->Identical(*i2, eq_instructions, eq_computations,
                        /*layout_sensitive=*/false);
 }
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
index caa57296f465698eb70d7cb8327d4678f394b323..b12b63b2dd779579307c1c4d4bf7860f5955af4d 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
@@ -360,6 +360,7 @@ HloModule foobar
 
 ENTRY %entrycomp (p: bf16[]) -> (f32[], f32[]) {
   %p = bf16[] parameter(0)
+  %constant.bf16 = bf16[] constant(1)
 
   %all-reduce.ar.1 = bf16[]
       all-reduce(%p),
@@ -377,7 +378,7 @@ ENTRY %entrycomp (p: bf16[]) -> (f32[], f32[]) {
       sharding={maximal device=0}
 
   %all-reduce.ar.2 = bf16[]
-      all-reduce(%p),
+      all-reduce(%constant.bf16),
       replica_groups={{0},{1}},
       all_reduce_id=1,
       to_apply=%sum.bf16,
@@ -407,7 +408,7 @@ ENTRY %entrycomp (p: bf16[]) -> (f32[], f32[]) {
   EXPECT_TRUE(changed);
   EXPECT_THAT(module->entry_computation()->root_instruction(),
               op::Tuple(op::AllReduce(op::Convert(op::Parameter())),
-                        op::AllReduce(op::Convert(op::Parameter()))));
+                        op::AllReduce(op::Convert(op::Constant()))));
   auto crs_after =
       module->entry_computation()->root_instruction()->operands()[0];
   auto replica_groups_after = crs_after->replica_groups();
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index 2cf24a9dd5fa18abe9dde4eb49b03c6586bfef03..215e8ced4bb3f98a26ac4eb9912a7fd4d917852f 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -115,12 +115,10 @@ StatusOr<StreamPool::Ptr> Backend::BorrowStream(int device_ordinal) {
 
 StatusOr<StreamPool::Ptr> Backend::BorrowStream(se::StreamExecutor* executor) {
   tensorflow::mutex_lock l(mu_);
-  if (0 == stream_pools_.count(executor)) {
-    stream_pools_.emplace(std::piecewise_construct,
-                          std::forward_as_tuple(executor),
-                          std::forward_as_tuple());
+  if (!stream_pools_.contains(executor)) {
+    stream_pools_.emplace(executor, absl::make_unique<StreamPool>());
   }
-  return stream_pools_.at(executor).BorrowStream(executor);
+  return stream_pools_.at(executor)->BorrowStream(executor);
 }
 
 Backend::Backend(se::Platform* platform, Compiler* compiler,
diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h
index 7ca993fb2656037951d98d9c4459a3c3e4c64c61..c35f033dc0180409ae3888c2050021da83f5c72a 100644
--- a/tensorflow/compiler/xla/service/backend.h
+++ b/tensorflow/compiler/xla/service/backend.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_cat.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
@@ -175,7 +176,8 @@ class Backend {
   tensorflow::mutex mu_;
 
   // Mapping from stream executor to stream pools, used by `BorrowStream` above.
-  std::map<se::StreamExecutor*, StreamPool> stream_pools_ GUARDED_BY(mu_);
+  absl::flat_hash_map<se::StreamExecutor*, std::unique_ptr<StreamPool>>
+      stream_pools_ GUARDED_BY(mu_);
 
   // The default memory allocator to use.
   std::unique_ptr<StreamExecutorMemoryAllocator> memory_allocator_;
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 6f4c1104f363146cebea83873b936ea3f8292801..d07615b828990f80e2f905837c46f5f2e15d5a63 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -86,10 +86,9 @@ std::vector<int64> ColorInterferenceGraph(
   // first, but it would be good to investigate other ordering heuristics too.
   std::vector<int64> nodes(node_count);
   std::iota(nodes.begin(), nodes.end(), 0);
-  std::sort(nodes.begin(), nodes.end(),
-            [&interference_map](const int64 i, const int64 j) {
-              return interference_map[i].size() > interference_map[j].size();
-            });
+  absl::c_sort(nodes, [&interference_map](const int64 i, const int64 j) {
+    return interference_map[i].size() > interference_map[j].size();
+  });
 
   const int64 kColorUnassigned = -1;
   std::vector<int64> assigned_colors(node_count, kColorUnassigned);
@@ -138,8 +137,8 @@ Status GatherComputationsByAllocationType(
     worklist.pop_front();
     const HloComputation* computation = worklist_front.first;
     bool is_thread_local = worklist_front.second;
-    bool in_thread_local_set = thread_local_set.count(computation) > 0;
-    bool in_global_set = global_set.count(computation) > 0;
+    bool in_thread_local_set = thread_local_set.contains(computation);
+    bool in_global_set = global_set.contains(computation);
 
     // If the computation has already been added to the respective set, then
     // nothing to do.
@@ -207,9 +206,9 @@ Status GatherComputationsByAllocationType(
 
   // Add the computations to the vectors in post order.
   for (auto* computation : module->MakeComputationPostOrder()) {
-    if (thread_local_set.count(computation) > 0) {
+    if (thread_local_set.contains(computation)) {
       thread_local_computations->push_back(computation);
-    } else if (global_set.count(computation) > 0) {
+    } else if (global_set.contains(computation)) {
       global_computations->push_back(computation);
     }
     // If the computation is not reachable from the entry computation, then it
@@ -219,13 +218,6 @@ Status GatherComputationsByAllocationType(
   return Status::OK();
 }
 
-size_t BufferAllocation::Slice::Hasher::operator()(Slice s) const {
-  uint64 h = std::hash<int64>()(s.index());
-  h = tensorflow::Hash64Combine(h, std::hash<int64>()(s.offset()));
-  h = tensorflow::Hash64Combine(h, std::hash<int64>()(s.size()));
-  return h;
-}
-
 string BufferAllocation::Slice::ToString() const {
   return absl::StrCat("{index:", index(), ", offset:", offset_,
                       ", size:", size_, "}");
@@ -240,7 +232,7 @@ BufferAllocation::Slice BufferAllocation::GetSlice(
 void BufferAllocation::AddAssignment(const LogicalBuffer& buffer, int64 offset,
                                      int64 size) {
   VLOG(4) << "Trying to add " << buffer << " to allocation #" << index();
-  CHECK(assigned_buffers_.count(&buffer) == 0)
+  CHECK(!assigned_buffers_.contains(&buffer))
       << "LogicalBuffer " << buffer << " already assigned to allocation "
       << index_;
   CHECK_LE(offset, size_) << "LogicalBuffer " << buffer
@@ -279,11 +271,12 @@ BufferAllocationProto BufferAllocation::ToProto() const {
     proto_assigned->set_offset(buffer_offset_size.second.offset);
     proto_assigned->set_size(buffer_offset_size.second.size);
   }
-  std::sort(proto.mutable_assigned()->begin(), proto.mutable_assigned()->end(),
-            [](const BufferAllocationProto::Assigned& assign1,
-               const BufferAllocationProto::Assigned& assign2) {
-              return assign1.logical_buffer_id() < assign2.logical_buffer_id();
-            });
+  absl::c_sort(*proto.mutable_assigned(),
+               [](const BufferAllocationProto::Assigned& assign1,
+                  const BufferAllocationProto::Assigned& assign2) {
+                 return assign1.logical_buffer_id() <
+                        assign2.logical_buffer_id();
+               });
   return proto;
 }
 
@@ -315,10 +308,10 @@ string BufferAllocation::ToString() const {
   for (const auto& buffer_offset_size : assigned_buffers_) {
     sorted_buffers.push_back(buffer_offset_size.first);
   }
-  std::sort(sorted_buffers.begin(), sorted_buffers.end(),
-            [](const LogicalBuffer* a, const LogicalBuffer* b) {
-              return a->id() < b->id();
-            });
+  absl::c_sort(sorted_buffers,
+               [](const LogicalBuffer* a, const LogicalBuffer* b) {
+                 return a->id() < b->id();
+               });
   for (const LogicalBuffer* buffer : sorted_buffers) {
     const OffsetSize& offset_size = FindOrDie(assigned_buffers_, buffer);
     StrAppend(&output, absl::StrFormat(
@@ -346,7 +339,7 @@ const PointsToSet& BufferAssignment::GetPointsToSet(
 
 bool BufferAssignment::HasAllocation(const LogicalBuffer& buffer) const {
   TF_CHECK_OK(points_to_analysis().VerifyBuffer(buffer));
-  return allocation_index_for_buffer_.count(&buffer) > 0;
+  return allocation_index_for_buffer_.contains(&buffer);
 }
 
 const BufferAllocation& BufferAssignment::GetAssignedAllocation(
@@ -401,7 +394,7 @@ bool BufferAssignment::HasAllocationAt(const HloInstruction* instruction,
                                        const ShapeIndex& index) const {
   for (const LogicalBuffer* buffer :
        GetPointsToSet(instruction).element(index)) {
-    if (allocation_index_for_buffer_.count(buffer) > 0) {
+    if (allocation_index_for_buffer_.contains(buffer)) {
       return true;
     }
   }
@@ -459,8 +452,7 @@ bool BufferAssignment::SharesSliceAtIndex(
 
 bool BufferAssignment::HaveDisjointSlices(const HloInstruction* hlo_a,
                                           const HloInstruction* hlo_b) const {
-  using SliceSet =
-      flat_hash_set<BufferAllocation::Slice, BufferAllocation::Slice::Hasher>;
+  using SliceSet = flat_hash_set<BufferAllocation::Slice>;
   // Gets the slices all of instr's subshapes.  If any subshape doesn't have an
   // assigned slice, returns the empty set.
   auto collect_slices = [&](const HloInstruction* instr) -> SliceSet {
@@ -487,10 +479,9 @@ bool BufferAssignment::HaveDisjointSlices(const HloInstruction* hlo_a,
   // didn't return the empty set) for both HLOs, and the two resulting sets of
   // slices are disjoint.
   return !slices_a.empty() && !slices_b.empty() &&
-         std::none_of(slices_a.begin(), slices_a.end(),
-                      [&](const BufferAllocation::Slice& slice) {
-                        return slices_b.count(slice) > 0;
-                      });
+         absl::c_none_of(slices_a, [&](const BufferAllocation::Slice& slice) {
+           return slices_b.contains(slice);
+         });
 }
 
 StatusOr<BufferAllocation::Slice>
@@ -519,7 +510,7 @@ BufferAllocation* BufferAssignment::NewAllocation(const LogicalBuffer& buffer,
 void BufferAssignment::AddAssignment(BufferAllocation* allocation,
                                      const LogicalBuffer& buffer, int64 offset,
                                      int64 size) {
-  CHECK_EQ(0, allocation_index_for_buffer_.count(&buffer))
+  CHECK(!allocation_index_for_buffer_.contains(&buffer))
       << "LogicalBuffer " << buffer << " already has an allocation.";
   CHECK(allocation->is_reusable() || allocation->assigned_buffers().empty())
       << "Non-reusable allocation already assigned a buffer: "
@@ -960,35 +951,35 @@ Status BufferAssigner::AssignBuffersForComputation(
   // operands (assuming operands are the same/larger size) enabling the
   // important reuse case where an elementwise instruction reuses one of its
   // operand's buffer. This improves locality.
-  std::sort(sorted_buffers.begin(), sorted_buffers.end(),
-            [has_sequential_order, &liveness, &post_order_position, assignment](
-                const LogicalBuffer* a, const LogicalBuffer* b) {
-              // Primary sort is by decreasing buffer size.
-              const int64 a_size = assignment->buffer_size_(*a);
-              const int64 b_size = assignment->buffer_size_(*b);
-              if (a_size != b_size) {
-                return a_size > b_size;  // use ">" for decreasing size.
-              }
-              // Otherwise live out buffers come before others, if the
-              // instructions are sequentially ordered.
-              if (has_sequential_order) {
-                const bool a_live_out = liveness.MaybeLiveOut(*a);
-                const bool b_live_out = liveness.MaybeLiveOut(*b);
-                if (a_live_out != b_live_out) {
-                  return a_live_out;
-                }
-              }
-              // Final tiebreaker is in instruction post order.
-              return post_order_position.at(a->instruction()) <
-                     post_order_position.at(b->instruction());
-            });
+  absl::c_sort(sorted_buffers,
+               [has_sequential_order, &liveness, &post_order_position,
+                assignment](const LogicalBuffer* a, const LogicalBuffer* b) {
+                 // Primary sort is by decreasing buffer size.
+                 const int64 a_size = assignment->buffer_size_(*a);
+                 const int64 b_size = assignment->buffer_size_(*b);
+                 if (a_size != b_size) {
+                   return a_size > b_size;  // use ">" for decreasing size.
+                 }
+                 // Otherwise live out buffers come before others, if the
+                 // instructions are sequentially ordered.
+                 if (has_sequential_order) {
+                   const bool a_live_out = liveness.MaybeLiveOut(*a);
+                   const bool b_live_out = liveness.MaybeLiveOut(*b);
+                   if (a_live_out != b_live_out) {
+                     return a_live_out;
+                   }
+                 }
+                 // Final tiebreaker is in instruction post order.
+                 return post_order_position.at(a->instruction()) <
+                        post_order_position.at(b->instruction());
+               });
 
   // BufferAllocations are necessarily created in decreasing size order. Keep
   // indices of previously created BufferAllocations in allocation_indices.
   std::vector<BufferAllocation::Index> allocation_indices;
   for (const LogicalBuffer* buffer : sorted_buffers) {
     VLOG(3) << "Assigning allocation to: " << *buffer;
-    if (colocated_buffers.count(buffer) > 0) {
+    if (colocated_buffers.contains(buffer)) {
       // Colocated buffers are currently assigned in an earlier pass.
       VLOG(3) << "Skipping colocated buffer: " << *buffer;
       continue;
@@ -1056,7 +1047,7 @@ Status BufferAssigner::AssignBuffersForComputation(
              assignment->GetAllSlices(operand, /*index=*/{})) {
           BufferAllocation* allocation =
               assignment->GetMutableAllocation(operand_slice.index());
-          if (colocated_allocations.count(allocation->index()) == 0) {
+          if (!colocated_allocations.contains(allocation->index())) {
             // TODO(b/32491382) Colocated buffers are currently assigned in an
             // earlier pass, and so can break the "increasing allocation size"
             // invariant in this function (causing this CHECK to fail). However,
@@ -1087,7 +1078,7 @@ Status BufferAssigner::AssignBuffersForComputation(
         // Instructions are iterated in increasing buffer size, so any
         // previously create allocation must be large enough to hold this
         // instruction's output (with the exception of colocated buffers).
-        if (colocated_allocations.count(allocation->index()) == 0) {
+        if (!colocated_allocations.contains(allocation->index())) {
           // TODO(b/32491382) Colocated buffers are currently assigned in an
           // earlier pass, and so can break the "increasing allocation size"
           // invariant in this function (causing this CHECK to fail). However,
@@ -1313,10 +1304,10 @@ std::vector<const LogicalBuffer*> ComputePeakMemoryLogicalBuffers(
                              live_buffers.end());
 
   // Stabily sort the live buffers.
-  std::sort(live_buffers_vector.begin(), live_buffers_vector.end(),
-            [](const LogicalBuffer* a, const LogicalBuffer* b) {
-              return a->id() < b->id();
-            });
+  absl::c_sort(live_buffers_vector,
+               [](const LogicalBuffer* a, const LogicalBuffer* b) {
+                 return a->id() < b->id();
+               });
   return live_buffers_vector;
 }
 
@@ -1376,7 +1367,7 @@ void BufferAssigner::AddSetToColocatedBufferSets(
   std::vector<size_t> overlap_set_indices;
   for (size_t index = 0; index < colocated_buffer_sets->size(); ++index) {
     for (const LogicalBuffer* buffer : colocated_set) {
-      if ((*colocated_buffer_sets)[index].count(buffer) > 0) {
+      if ((*colocated_buffer_sets)[index].contains(buffer)) {
         VLOG(5) << "Found overlap with existing set on buffer "
                 << buffer->ToString() << "\n"
                 << ColocatedBufferSetsToString((*colocated_buffer_sets)[index],
@@ -1539,15 +1530,16 @@ void BufferAssigner::BuildColocatedBufferSets(
   VLOG(4) << "Input/Output Alias Config: ";
   VLOG(4) << module->input_output_alias_config();
   module->input_output_alias_config().ForEachAlias(
-      [&](const ShapeIndex& output_index, int64 param_number,
-          const ShapeIndex& param_index) {
+      [&](const ShapeIndex& output_index,
+          const HloInputOutputAliasConfig::Alias& alias) {
         std::vector<const LogicalBuffer*> colocated_set;
         AddBufferToColocatedSet(module->entry_computation()->root_instruction(),
                                 output_index, points_to_analysis,
                                 &colocated_set);
         AddBufferToColocatedSet(
-            module->entry_computation()->parameter_instruction(param_number),
-            param_index, points_to_analysis, &colocated_set);
+            module->entry_computation()->parameter_instruction(
+                alias.parameter_number),
+            alias.parameter_index, points_to_analysis, &colocated_set);
         AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
       });
 
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index 0a9fdede803e84ca42472259084615c031b206eb..4baab9b6ad71293d48d5ed70c2922fdf40ef119a 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -186,9 +186,10 @@ class BufferAllocation {
              end > other.offset_;
     }
 
-    struct Hasher {
-      size_t operator()(Slice s) const;
-    };
+    template <typename H>
+    friend H AbslHashValue(H h, const Slice& s) {
+      return H::combine(std::move(h), s.index(), s.offset(), s.size());
+    }
 
     string ToString() const;
 
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index 8f482e6ba8c3e71c9980be5e6947ea61f3b4ef29..1b4e93a2f303e5aad3e4081f36e2417277f62c71 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/buffer_value.h"
@@ -309,7 +310,7 @@ class BufferAssignmentTest : public HloTestBase {
 static bool BuffersDistinct(const std::vector<const HloInstruction*>& a,
                             const std::vector<const HloInstruction*>& b,
                             const BufferAssignment& assignment) {
-  std::set<BufferAllocation::Slice> a_slices;
+  absl::flat_hash_set<BufferAllocation::Slice> a_slices;
   for (const HloInstruction* instruction : a) {
     if (assignment.HasTopLevelAllocation(instruction)) {
       a_slices.insert(
@@ -319,8 +320,8 @@ static bool BuffersDistinct(const std::vector<const HloInstruction*>& a,
 
   for (const HloInstruction* instruction : b) {
     if (assignment.HasTopLevelAllocation(instruction)) {
-      if (a_slices.count(assignment.GetUniqueTopLevelSlice(instruction)
-                             .ConsumeValueOrDie())) {
+      if (a_slices.contains(assignment.GetUniqueTopLevelSlice(instruction)
+                                .ConsumeValueOrDie())) {
         return false;
       }
     }
@@ -2485,9 +2486,9 @@ while_body {
   get-tuple-element.3 = s32[] get-tuple-element(state), index=0
   constant.2 = s32[] constant(128)
   add.5 = s32[] add(get-tuple-element.3, constant.2)
-  constant.3 = s32[3]{0} constant({0, 0, 0})
-  dynamic-update-slice.5 = f32[1280,1,128]{2,1,0} dynamic-update-slice(get-tuple-element.4, broadcast.6, constant.3)
-  dynamic-update-slice.9 = f32[1280,1,128]{2,1,0} dynamic-update-slice(dynamic-update-slice.5, broadcast.6, constant.3)
+  constant.3 = s32[] constant(0)
+  dynamic-update-slice.5 = f32[1280,1,128]{2,1,0} dynamic-update-slice(get-tuple-element.4, broadcast.6, constant.3, constant.3, constant.3)
+  dynamic-update-slice.9 = f32[1280,1,128]{2,1,0} dynamic-update-slice(dynamic-update-slice.5, broadcast.6, constant.3, constant.3, constant.3)
   ROOT tuple.85 = (s32[], s32[], s32[2]{0}, f32[1280,1,128]{2,1,0}) tuple(add.5, dynamic-update-slice.9)
 }
 
diff --git a/tensorflow/compiler/xla/service/call_graph_test.cc b/tensorflow/compiler/xla/service/call_graph_test.cc
index 79241342005d3dc59051ea638798062ff56f11a9..5de724f8924b78008ba4c56603b61bf93fbc5e7c 100644
--- a/tensorflow/compiler/xla/service/call_graph_test.cc
+++ b/tensorflow/compiler/xla/service/call_graph_test.cc
@@ -384,7 +384,7 @@ TEST_F(CallGraphTest, ComplexGraph) {
 
   // Verify visitation order of some computations in the graph.
   auto index_of = [&visited](const HloComputation* comp) {
-    auto it = std::find(visited.begin(), visited.end(), comp);
+    auto it = absl::c_find(visited, comp);
     EXPECT_NE(it, visited.end());
     return std::distance(visited.begin(), it);
   };
diff --git a/tensorflow/compiler/xla/service/channel_tracker.cc b/tensorflow/compiler/xla/service/channel_tracker.cc
index 3c2d1ae6d82ebc6c10d52194fd1cec5e291025f7..b517495f2ea0c75679685c67f757ff586f8c79e3 100644
--- a/tensorflow/compiler/xla/service/channel_tracker.cc
+++ b/tensorflow/compiler/xla/service/channel_tracker.cc
@@ -72,7 +72,7 @@ ChannelHandle ChannelTracker::AllocateHandle(ChannelHandle::ChannelType type) {
 }
 
 Status ChannelTracker::RegisterSendInternal(const ChannelHandle& handle) {
-  if (opaque_to_channel_.count(handle.handle()) == 0) {
+  if (!opaque_to_channel_.contains(handle.handle())) {
     return NotFound("channel handle not found: %d", handle.handle());
   }
   Channel& channel = opaque_to_channel_[handle.handle()];
@@ -94,7 +94,7 @@ Status ChannelTracker::RegisterSendInternal(const ChannelHandle& handle) {
 }
 
 Status ChannelTracker::RegisterRecvInternal(const ChannelHandle& handle) {
-  if (opaque_to_channel_.count(handle.handle()) == 0) {
+  if (!opaque_to_channel_.contains(handle.handle())) {
     return NotFound("channel handle not found: %d", handle.handle());
   }
   Channel& channel = opaque_to_channel_[handle.handle()];
diff --git a/tensorflow/compiler/xla/service/channel_tracker.h b/tensorflow/compiler/xla/service/channel_tracker.h
index 52037bf9b52556c6aa2e66dd3209e25cf085cfe3..89e17eba36f23077ce4cf0704e7455b76bee68d1 100644
--- a/tensorflow/compiler/xla/service/channel_tracker.h
+++ b/tensorflow/compiler/xla/service/channel_tracker.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <map>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/status.h"
@@ -83,7 +84,8 @@ class ChannelTracker {
 
   // Mapping from ChannelHandle value to the corresponding registered
   // Channel object.
-  std::map<int64, Channel> opaque_to_channel_ GUARDED_BY(channel_mutex_);
+  absl::flat_hash_map<int64, Channel> opaque_to_channel_
+      GUARDED_BY(channel_mutex_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(ChannelTracker);
 };
diff --git a/tensorflow/compiler/xla/service/compiler.cc b/tensorflow/compiler/xla/service/compiler.cc
index 8f08c244908efb823b3870c19bdc3491fa87d44f..653f4555a77cc82e91fb1cd26206b93826375732 100644
--- a/tensorflow/compiler/xla/service/compiler.cc
+++ b/tensorflow/compiler/xla/service/compiler.cc
@@ -98,10 +98,17 @@ Compiler::GetPlatformCompilers() {
   auto* factories = GetPlatformCompilerFactories();
   auto it = factories->find(platform->id());
   if (it == factories->end()) {
+    string hint;
+    if (platform->Name() == "Host") {
+      hint = " (hint: try linking in tensorflow/compiler/jit:xla_cpu_jit)";
+    } else if (platform->Name() == "CUDA") {
+      hint = " (hint: try linking in tensorflow/compiler/jit:xla_gpu_jit)";
+    }
+
     return NotFound(
         "could not find registered compiler for platform %s -- check "
-        "target linkage",
-        platform->Name());
+        "target linkage%s",
+        platform->Name(), hint);
   }
 
   // And then we invoke the factory, placing the result into the mapping.
diff --git a/tensorflow/compiler/xla/service/computation_layout.cc b/tensorflow/compiler/xla/service/computation_layout.cc
index efc893818d03a20d6bd65b7dc1da72ea5da5ceb0..92d1ca4ba5da802a5f1c544017ac52dda38e9b1d 100644
--- a/tensorflow/compiler/xla/service/computation_layout.cc
+++ b/tensorflow/compiler/xla/service/computation_layout.cc
@@ -42,8 +42,8 @@ void ComputationLayout::SetToDefaultLayout() {
 }
 
 bool ComputationLayout::LayoutIsSet() const {
-  return std::all_of(parameter_layouts_.begin(), parameter_layouts_.end(),
-                     [](const ShapeLayout& s) { return s.LayoutIsSet(); }) &&
+  return absl::c_all_of(parameter_layouts_,
+                        [](const ShapeLayout& s) { return s.LayoutIsSet(); }) &&
          result_layout_.LayoutIsSet();
 }
 
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index e0165d557deb390636b42df12f099792042f1b65..5e26a63cebfa9b2e50f4b13335c10c246999d4df 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -349,11 +349,12 @@ Status AddCopiesForAliasedInputOutputs(HloModule* module) {
     ShapeTree<bool> param_indices_to_copy(param->shape());
 
     module->input_output_alias_config().ForEachAlias(
-        [&](const ShapeIndex& output_index, int64 param_number,
-            const ShapeIndex& param_index) {
-          if (param_number == param->parameter_number()) {
+        [&](const ShapeIndex& output_index,
+            const HloInputOutputAliasConfig::Alias& alias) {
+          if (alias.parameter_number == param->parameter_number()) {
             param_has_alias = true;
-            *(param_indices_to_copy.mutable_element(param_index)) = true;
+            *(param_indices_to_copy.mutable_element(alias.parameter_index)) =
+                true;
             *(output_indices_to_copy.mutable_element(output_index)) = true;
           }
         });
@@ -395,13 +396,14 @@ Status AddCopiesForAliasedInputOutputs(HloModule* module) {
 
   // Add control dependencies between the input/output copies.
   TF_RETURN_IF_ERROR(module->input_output_alias_config().ForEachAliasWithStatus(
-      [&](const ShapeIndex& output_index, int64 param_number,
-          const ShapeIndex& input_index) -> Status {
-        if (!copied_parameters[param_number]) {
+      [&](const ShapeIndex& output_index,
+          const HloInputOutputAliasConfig::Alias& alias) -> Status {
+        if (!copied_parameters[alias.parameter_number]) {
           return Status::OK();
         }
         HloInstruction* from =
-            copied_parameters[param_number]->element(input_index);
+            copied_parameters[alias.parameter_number]->element(
+                alias.parameter_index);
         HloInstruction* to = output_copy_tree.element(output_index);
 
         TF_RET_CHECK(from != nullptr);
@@ -539,10 +541,9 @@ class CopyRemover {
         }
 
         std::vector<const HloValue*> values = buffer.values();
-        std::sort(values.begin(), values.end(),
-                  [this](const HloValue* a, const HloValue* b) {
-                    return ordering_.IsDefinedBefore(*a, *b);
-                  });
+        absl::c_sort(values, [this](const HloValue* a, const HloValue* b) {
+          return ordering_.IsDefinedBefore(*a, *b);
+        });
 
         // Create a list containing all of the values in the buffer.
         AddValueList(values, &value_to_node);
@@ -842,12 +843,11 @@ class CopyRemover {
       copy_value_node->next->prev = operand_node;
 
       // Patch up uses. Remove use of copy from operand_node uses.
-      auto it =
-          std::find_if(operand_node->uses.begin(), operand_node->uses.end(),
-                       [copy_value_node](const HloUse* use) {
-                         return use->instruction ==
-                                copy_value_node->value->defining_instruction();
-                       });
+      auto it = absl::c_find_if(
+          operand_node->uses, [copy_value_node](const HloUse* use) {
+            return use->instruction ==
+                   copy_value_node->value->defining_instruction();
+          });
       CHECK(it != operand_node->uses.end());
       operand_node->uses.erase(it);
 
diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc
index e4e9d7ba05c115be9dd0eb53ebd7de208d514efb..4d4074943e3bf9f6f2a37abc63f037c2dab06e0f 100644
--- a/tensorflow/compiler/xla/service/copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc
@@ -1376,9 +1376,11 @@ TEST_F(CopyInsertionTest, CrossingParameters) {
   builder.AddInstruction(HloInstruction::CreateTuple({gte1, gte0}));
   module->AddEntryComputation(builder.Build());
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   InsertCopies(module.get());
 
   EXPECT_EQ(CountCopies(*module), 4);
@@ -1409,9 +1411,11 @@ TEST_F(CopyInsertionTest, ParametersAliasing) {
   builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
   module->AddEntryComputation(builder.Build());
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   InsertCopies(module.get());
 
   EXPECT_EQ(CountCopies(*module), 0);
@@ -1475,7 +1479,8 @@ TEST_F(CopyInsertionTest, ParameterWithPartialAliasing) {
   builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
   module->AddEntryComputation(builder.Build());
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   InsertCopies(module.get());
 
   EXPECT_THAT(module->entry_computation()->root_instruction(),
@@ -1516,7 +1521,8 @@ TEST_F(CopyInsertionTest, ParameterAndParallelOpsWithPartialAliasing) {
   builder.AddInstruction(HloInstruction::CreateTuple({negate0, negate1}));
   module->AddEntryComputation(builder.Build());
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   InsertCopies(module.get());
 
   EXPECT_EQ(CountCopies(*module), 0);
@@ -1557,7 +1563,8 @@ TEST_F(CopyInsertionTest, ParameterAndOpsWithPartialAliasing) {
   builder.AddInstruction(HloInstruction::CreateTuple({add, negate1}));
   module->AddEntryComputation(builder.Build());
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   InsertCopies(module.get());
 
   EXPECT_EQ(CountCopies(*module), 0);
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index f49b5110be5c4bab63b423e5ed2e67bc1828f6e3..a197bdddc88dafe9b1a5769da7bc1bdc54f84dc2 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -1,6 +1,14 @@
 # Description:
 #    LLVM-based CPU backend for XLA.
 
+load("//tensorflow/compiler/xla:xla.bzl", "ORC_JIT_MEMORY_MAPPER_TARGETS")
+load(
+    "//third_party/mkl:build_defs.bzl",
+    "mkl_deps",
+)
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
+load(":build_defs.bzl", "runtime_copts")
+
 licenses(["notice"])  # Apache 2.0
 
 package(
@@ -14,15 +22,6 @@ package_group(
     ],
 )
 
-load(":build_defs.bzl", "runtime_copts")
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
-load("//tensorflow/compiler/xla:xla.bzl", "ORC_JIT_MEMORY_MAPPER_TARGETS")
-load(
-    "//third_party/mkl:build_defs.bzl",
-    "mkl_deps",
-)
-
 # Filegroup used to collect source files for dependency checking.
 filegroup(
     name = "c_srcs",
@@ -114,6 +113,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:conditional_simplifier",
         "//tensorflow/compiler/xla/service:convolution_group_converter",
         "//tensorflow/compiler/xla/service:dot_decomposer",
+        "//tensorflow/compiler/xla/service:dynamic_index_splitter",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
         "//tensorflow/compiler/xla/service:hlo",
@@ -241,6 +241,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:tuple_points_to_analysis",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor/host:host_stream",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:span",
@@ -364,15 +365,33 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "tiled_dot_emitter",
+    srcs = ["tiled_dot_emitter.cc"],
+    hdrs = ["tiled_dot_emitter.h"],
+    deps = [
+        ":vector_support_library",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service/llvm_ir:kernel_support_library",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "//tensorflow/core:lib",
+        "@llvm//:core",
+    ],
+)
+
 cc_library(
     name = "dot_op_emitter",
     srcs = ["dot_op_emitter.cc"],
-    hdrs = ["dot_op_emitter.h"],
+    hdrs = [
+        "dot_op_emitter.h",
+    ],
     deps = [
         ":cpu_options",
         ":cpu_runtime",
         ":ir_emission_utils",
         ":target_machine_features",
+        ":tiled_dot_emitter",
         ":vector_support_library",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -631,6 +650,7 @@ cc_library(
     deps = [
         ":runtime_matvec",
         "//tensorflow/core:framework_lite",
+        "//tensorflow/core/kernels:eigen_contraction_kernel",
         "//third_party/eigen3",
     ],
 )
@@ -1008,7 +1028,6 @@ tf_cc_test(
     size = "small",
     srcs = ["cpu_eigen_tensor_alignment_test.cc"],
     deps = [
-        ":dot_op_emitter",
         ":ir_emission_utils",
         ":target_machine_features_fake",
         "//tensorflow/compiler/xla:test",
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index dbab839aee95b249bd993e7a0a63fd3af38f8537..a6d92ce10ee311704eb4e40cee1d14fa6fdb1e57 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -69,6 +69,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/dot_decomposer.h"
+#include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -244,6 +245,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
   HloPassPipeline pipeline("HLO passes through layout assignment");
   pipeline.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/false,
                                             /*allow_mixed_precision=*/false);
+  pipeline.AddPass<DynamicIndexSplitter>();
   pipeline.AddPass<CpuHloSupportChecker>();
 
   ReducePrecisionInsertion::AddPasses(
@@ -302,7 +304,8 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
   pipeline.AddPass<TransposeFolding>(
       [&](const HloInstruction& dot,
           const TransposeFolding::OperandIndices& candidate_operands) {
-        return PotentiallyImplementedAsEigenDot(dot, *target_machine_features)
+        return DotImplementationCanHandleTranspose(dot,
+                                                   *target_machine_features)
                    ? candidate_operands
                    : TransposeFolding::OperandIndices{};
       },
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc
index 8727c72b6e42517b1859e98ecadb41bbceed761c..485769a373acf5ae70c471b1a5dfcfb20ff772ef 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
@@ -28,37 +27,6 @@ namespace {
 
 class CpuEigenTensorAlignmentTest : public ::testing::Test {};
 
-TEST_F(CpuEigenTensorAlignmentTest, EigenDotAlignment) {
-  string hlo_string = R"(
-HloModule DotOperation
-
-ENTRY DotOperation {
-  arg0 = f32[5,256] parameter(0)
-  arg1 = f32[256,1024] parameter(1)
-  ROOT dot = f32[5,1024] dot(arg0, arg1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
-}
-)";
-
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseHloString(hlo_string));
-
-  HloInstruction* dot = module->entry_computation()->root_instruction();
-
-  TargetMachineFeaturesWithFakeAlignmentLogic target_machine_with_no_alignment(
-      [](int64 size) { return 1; });
-
-  EXPECT_FALSE(
-      PotentiallyImplementedAsEigenDot(*dot, target_machine_with_no_alignment));
-
-  TargetMachineFeaturesWithFakeAlignmentLogic
-      target_machine_with_full_alignment([](int64 size) {
-        return TargetMachineFeatures::kEigenExpectedTensorAlignment;
-      });
-
-  EXPECT_TRUE(PotentiallyImplementedAsEigenDot(
-      *dot, target_machine_with_full_alignment));
-}
-
 TEST_F(CpuEigenTensorAlignmentTest, EigenConvAlignment) {
   string hlo_string = R"(
 HloModule ConvOperation
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
index 527df0bd1c23bba74f32226e5622fed32f7dcf84..46c1d4c38e01b6d3b79699fc0cdc73868c974353 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
@@ -332,7 +332,7 @@ TEST_F(OpcodeFusionTest, Exponential_Reshape_Negate) {
 TEST_F(OpcodeFusionTest, Broadcast_Reshape_DynamicSlice_Tanh) {
   HloComputation::Builder builder(TestName());
   Shape param_shape = ShapeUtil::MakeShape(F32, {8});
-  Shape starts_shape = ShapeUtil::MakeShape(F32, {2});
+  Shape starts_shape = ShapeUtil::MakeShape(F32, {});
   Shape broadcast_shape = ShapeUtil::MakeShape(F32, {1, 8, 8});
   Shape reshape_shape = ShapeUtil::MakeShape(F32, {8, 8});
   Shape dynamic_slice_shape = ShapeUtil::MakeShape(F32, {4, 4});
@@ -340,13 +340,15 @@ TEST_F(OpcodeFusionTest, Broadcast_Reshape_DynamicSlice_Tanh) {
       HloInstruction::CreateParameter(0, param_shape, "param"));
   HloInstruction* param1 = builder.AddInstruction(
       HloInstruction::CreateParameter(1, starts_shape, "starts"));
+  HloInstruction* param2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, starts_shape, "starts"));
   HloInstruction* broadcast2 = builder.AddInstruction(
       HloInstruction::CreateBroadcast(broadcast_shape, param0, {1}));
   HloInstruction* reshape3 = builder.AddInstruction(
       HloInstruction::CreateReshape(reshape_shape, broadcast2));
   HloInstruction* dynamic_slice4 =
       builder.AddInstruction(HloInstruction::CreateDynamicSlice(
-          dynamic_slice_shape, reshape3, param1, {4, 4}));
+          dynamic_slice_shape, reshape3, {param1, param2}, {4, 4}));
   builder.AddInstruction(HloInstruction::CreateUnary(
       dynamic_slice_shape, HloOpcode::kTanh, dynamic_slice4));
 
@@ -356,7 +358,8 @@ TEST_F(OpcodeFusionTest, Broadcast_Reshape_DynamicSlice_Tanh) {
   RunFusionAndCheckOpcodesWereFused(
       module.get(),
       {HloOpcode::kTanh, HloOpcode::kDynamicSlice, HloOpcode::kReshape,
-       HloOpcode::kBroadcast, HloOpcode::kParameter, HloOpcode::kParameter});
+       HloOpcode::kBroadcast, HloOpcode::kParameter, HloOpcode::kParameter,
+       HloOpcode::kParameter});
 }
 
 TEST_F(OpcodeFusionTest, Broadcast_Negate) {
@@ -381,14 +384,16 @@ TEST_F(OpcodeFusionTest, Broadcast_Negate) {
 TEST_F(OpcodeFusionTest, DynamicSlice_Negate) {
   HloComputation::Builder builder(TestName());
   Shape param_shape = ShapeUtil::MakeShape(F32, {4});
-  Shape slice_shape = ShapeUtil::MakeShape(F32, {1});
+  Shape slice_shape = ShapeUtil::MakeShape(F32, {});
   Shape result_shape = ShapeUtil::MakeShape(F32, {2});
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, param_shape, "param"));
   HloInstruction* param1 = builder.AddInstruction(
       HloInstruction::CreateParameter(1, slice_shape, "starts"));
-  HloInstruction* dynamic_slice2 = builder.AddInstruction(
-      HloInstruction::CreateDynamicSlice(result_shape, param0, param1, {2}));
+  HloInstruction* dynamic_slice2 =
+      builder.AddInstruction(HloInstruction::CreateDynamicSlice(
+          result_shape, param0,
+          std::initializer_list<HloInstruction*>({param1}), {2}));
   builder.AddInstruction(HloInstruction::CreateUnary(
       result_shape, HloOpcode::kNegate, dynamic_slice2));
 
@@ -548,28 +553,36 @@ TEST_F(OpcodeFusionTest, DynamicSliceWithDynamicUpdateSlice) {
   Shape full_shape = ShapeUtil::MakeShape(F32, {10, 100, 1000});
   Shape slice_shape = ShapeUtil::MakeShape(F32, {10, 1, 1000});
 
+  std::vector<HloInstruction*> slice_indices, update_indices;
+  for (int i = 0; i < 3; ++i) {
+    slice_indices.push_back(
+        builder.AddInstruction(HloInstruction::CreateParameter(
+            1 + i, ShapeUtil::MakeShape(U32, {}), "slice_indices")));
+    update_indices.push_back(
+        builder.AddInstruction(HloInstruction::CreateParameter(
+            5 + i, ShapeUtil::MakeShape(U32, {}), "update_indices")));
+  }
   HloInstruction* slice =
       builder.AddInstruction(HloInstruction::CreateDynamicSlice(
           slice_shape,
           builder.AddInstruction(
               HloInstruction::CreateParameter(0, full_shape, "slice_from")),
-          builder.AddInstruction(HloInstruction::CreateParameter(
-              1, ShapeUtil::MakeShape(U32, {3}), "slice_indices")),
+          slice_indices,
           /*slice_sizes=*/{10, 1, 1000}));
 
   builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
       full_shape,
       builder.AddInstruction(
-          HloInstruction::CreateParameter(2, full_shape, "to_update")),
-      slice,
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          3, ShapeUtil::MakeShape(U32, {3}), "update_indices"))));
+          HloInstruction::CreateParameter(4, full_shape, "to_update")),
+      slice, update_indices));
 
   module->AddEntryComputation(builder.Build());
   RunFusionAndCheckOpcodesWereFused(
-      module.get(), {HloOpcode::kDynamicSlice, HloOpcode::kDynamicUpdateSlice,
-                     HloOpcode::kParameter, HloOpcode::kParameter,
-                     HloOpcode::kParameter, HloOpcode::kParameter});
+      module.get(),
+      {HloOpcode::kDynamicSlice, HloOpcode::kDynamicUpdateSlice,
+       HloOpcode::kParameter, HloOpcode::kParameter, HloOpcode::kParameter,
+       HloOpcode::kParameter, HloOpcode::kParameter, HloOpcode::kParameter,
+       HloOpcode::kParameter, HloOpcode::kParameter});
 }
 
 TEST_F(OpcodeFusionTest, MessOfFusibleNodes) {
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
index 1cb154e9ca3bbfd9742df96316272e0fc653da36..95b8025f873c56bea063ff258d4abd6614257d85 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
@@ -46,8 +46,7 @@ static bool ShouldMakeAllUsersColMajor(const HloInstruction* instruction) {
   for (auto* user : instruction->users()) {
     optional<int64> operand_idx = ProfitableToMakeDotOperandColumnMajor(*user);
     if (!operand_idx || user->operand(*operand_idx) != instruction ||
-        std::count(user->operands().begin(), user->operands().end(),
-                   instruction) != 1) {
+        absl::c_count(user->operands(), instruction) != 1) {
       return false;
     }
   }
@@ -94,60 +93,38 @@ static Shape ColMajorShape(const Shape& old_shape) {
   return new_shape;
 }
 
+static bool OperandsAndResultMustHaveRowMajorLayout(
+    const HloInstruction& instr,
+    const TargetMachineFeatures& target_machine_features) {
+  if (instr.opcode() == HloOpcode::kConvolution) {
+    return PotentiallyImplementedAsEigenConvolution(instr,
+                                                    target_machine_features);
+  } else if (instr.opcode() == HloOpcode::kDot) {
+    return DotOperandsAndResultMustHaveRowMajorLayout(instr,
+                                                      target_machine_features);
+  }
+  return false;
+}
+
 Status CpuLayoutAssignment::AddBackendConstraints(
     LayoutConstraints* constraints) {
   ShouldMakeOperandColMajorCache cache;
 
   const HloComputation* computation = constraints->computation();
   for (auto* instruction : computation->instructions()) {
-    if (instruction->opcode() == HloOpcode::kConvolution &&
-        PotentiallyImplementedAsEigenConvolution(*instruction,
-                                                 target_machine_features_)) {
-      const HloInstruction* convolution = instruction;
-      const HloInstruction* lhs_instruction = convolution->operand(0);
-      const HloInstruction* rhs_instruction = convolution->operand(1);
-
-      // In order to implement `convolution` with Eigen convolution, the layouts
-      // of the input, filter, and output need to be row-major.
-      //
-      // These constraints are not hard constraints. Ideally, we should decide
-      // which layouts to choose according to some cost model.
-      Shape output_shape(RowMajorShape(convolution->shape()));
-      Shape input_shape(RowMajorShape(lhs_instruction->shape()));
-      Shape filter_shape(RowMajorShape(rhs_instruction->shape()));
-
-      // Set layouts of the instructions' shapes.
-      TF_RETURN_IF_ERROR(
-          constraints->SetOperandLayout(input_shape, convolution, 0));
-      TF_RETURN_IF_ERROR(
-          constraints->SetOperandLayout(filter_shape, convolution, 1));
-      TF_RETURN_IF_ERROR(
-          constraints->SetInstructionLayout(output_shape, convolution));
+    if (OperandsAndResultMustHaveRowMajorLayout(*instruction,
+                                                target_machine_features_)) {
+      TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(
+          RowMajorShape(instruction->shape()), instruction));
+      for (int i = 0; i < instruction->operand_count(); i++) {
+        TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
+            RowMajorShape(instruction->operand(i)->shape()), instruction, i));
+      }
     } else if (optional<int64> op_idx =
                    ShouldMakeOperandColumnMajor(&cache, *instruction)) {
       const HloInstruction* op = instruction->operand(*op_idx);
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
           ColMajorShape(op->shape()), instruction, *op_idx));
-    } else if (PotentiallyImplementedAsEigenDot(*instruction,
-                                                target_machine_features_)) {
-      const HloInstruction* dot = instruction;
-      // In order to implement `dot` with Eigen dot, the layouts of the lhs,
-      // rhs, and output need to be row-major.
-      //
-      // These constraints are not hard constraints. Ideally, we should decide
-      // which layouts to choose according to some cost model.
-      Shape output_shape(RowMajorShape(dot->shape()));
-
-      const HloInstruction* lhs_instruction = dot->operand(0);
-      Shape lhs_shape(RowMajorShape(lhs_instruction->shape()));
-      TF_RETURN_IF_ERROR(constraints->SetOperandLayout(lhs_shape, dot, 0));
-
-      const HloInstruction* rhs_instruction = dot->operand(1);
-      Shape rhs_shape(RowMajorShape(rhs_instruction->shape()));
-      TF_RETURN_IF_ERROR(constraints->SetOperandLayout(rhs_shape, dot, 1));
-
-      // Set layouts of the instructions' shapes.
-      TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(output_shape, dot));
     } else {
       for (int64 operand_no = 0; operand_no < instruction->operand_count();
            ++operand_no) {
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
index 92debb83e33b1400a59e5eef0f90971392ab7b22..ff654c83d61e7cc09ac7839feccaf2bc9cb3c63c 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
@@ -23,8 +23,8 @@ namespace {
 
 const char* const kXlaOptimizeForSizeCpuOption = "xla_cpu_optimize_for_size";
 const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor";
-const char* const kXlaEnableExperimentalLlvmIrGemm =
-    "xla_enable_experimental_llvm_ir_gemm";
+const char* const kXlaForceEnableExperimentalLlvmIrGemm =
+    "xla_force_enable_experimental_llvm_ir_gemm";
 const char* const kLlvmIrGemmTileSize = "xla_llvm_ir_gemm_tile_size";
 
 }  // namespace
@@ -57,10 +57,10 @@ absl::optional<int64> LlvmIrGemvTilingFactor(const HloModuleConfig& config) {
   return absl::nullopt;
 }
 
-bool EnableExperimentalLlvmIrGemm(const HloModuleConfig& config) {
+bool ForceEnableExperimentalLlvmIrGemm(const HloModuleConfig& config) {
   const auto& extra_options_map =
       config.debug_options().xla_backend_extra_options();
-  return extra_options_map.count(kXlaEnableExperimentalLlvmIrGemm) > 0;
+  return extra_options_map.count(kXlaForceEnableExperimentalLlvmIrGemm) > 0;
 }
 
 static absl::string_view RemoveSuffix(absl::string_view str,
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.h b/tensorflow/compiler/xla/service/cpu/cpu_options.h
index 47c7eb13b6e4cc05a23f82b8d2a25249f4b82ac0..99e6702d14aed8ffb148adec2bdd02dbc7c3c7e3 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.h
@@ -26,7 +26,7 @@ namespace options {
 
 bool OptimizeForSizeRequested(const HloModuleConfig& config);
 bool VectorizedReduceDisabled(const HloModuleConfig& config);
-bool EnableExperimentalLlvmIrGemm(const HloModuleConfig& config);
+bool ForceEnableExperimentalLlvmIrGemm(const HloModuleConfig& config);
 absl::optional<int64> LlvmIrGemvTilingFactor(const HloModuleConfig& config);
 absl::optional<std::tuple<int64, int64, int64>> LlvmIrGemmTileSize(
     const HloModuleConfig& config);
diff --git a/tensorflow/compiler/xla/service/cpu/disassembler.cc b/tensorflow/compiler/xla/service/cpu/disassembler.cc
index 3ae64142cd7e32d3aa8d50870efaf94698c06440..c3c6847b7b77e2fb0470630815de9f5d7a6c5b9c 100644
--- a/tensorflow/compiler/xla/service/cpu/disassembler.cc
+++ b/tensorflow/compiler/xla/service/cpu/disassembler.cc
@@ -77,17 +77,16 @@ StatusOr<DisassemblerResult> Disassembler::DisassembleObjectFile(
     }
 
     // Sort the symbols in increasing address order.
-    std::sort(
-        symbols.begin(), symbols.end(),
-        [](const llvm::object::SymbolRef& a, const llvm::object::SymbolRef& b) {
-          // getAddress returns a Expected object. Assert there is no error
-          // before extracting the address.
-          llvm::Expected<uint64_t> a_address_or_error = a.getAddress();
-          CHECK(a_address_or_error);
-          llvm::Expected<uint64_t> b_address_or_error = b.getAddress();
-          CHECK(b_address_or_error);
-          return a_address_or_error.get() < b_address_or_error.get();
-        });
+    absl::c_sort(symbols, [](const llvm::object::SymbolRef& a,
+                             const llvm::object::SymbolRef& b) {
+      // getAddress returns a Expected object. Assert there is no error
+      // before extracting the address.
+      llvm::Expected<uint64_t> a_address_or_error = a.getAddress();
+      CHECK(a_address_or_error);
+      llvm::Expected<uint64_t> b_address_or_error = b.getAddress();
+      CHECK(b_address_or_error);
+      return a_address_or_error.get() < b_address_or_error.get();
+    });
 
     // Construct ArrayRef pointing to section contents.
     llvm::StringRef section_content_string;
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 1525a33af7a490f99762733a677d7913c480200d..b018e0cd462ecc8d7924ecc5e1ecc3d27c6fe2c6 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
+#include "tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/vector_support_library.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
@@ -41,931 +42,163 @@ namespace xla {
 using llvm_ir::SetToFirstInsertPoint;
 
 namespace cpu {
-
 namespace {
-// Provides tiled access to an in-memory rank 2 array.
-class MemoryTile {
- public:
-  // Constructs a MemoryTile that can operate on tiles consisting of
-  // `tile_size_along_major_dim` vectors from the matrix `matrix`, starting at
-  // `major_dim_offset` in the major dimension.  The tile size along the minor
-  // dimension is the vector size, and that is implicitly determined by `vsl`.
-  MemoryTile(VectorSupportLibrary* vsl, llvm::IRBuilder<>* b,
-             llvm::Value* matrix, int64 matrix_size_along_minor_dim,
-             llvm::Value* major_dim_offset, int64 tile_size_along_major_dim)
-      : vsl_(vsl), b_(b) {
-    pointers_.reserve(tile_size_along_major_dim);
-    for (int64 i = 0; i < tile_size_along_major_dim; i++) {
-      llvm::Value* total_offset =
-          b->CreateMul(b->getInt64(matrix_size_along_minor_dim),
-                       b->CreateAdd(b->getInt64(i), major_dim_offset));
-      pointers_.push_back(vsl_->ComputeOffsetPointer(matrix, total_offset));
-    }
-  }
-
-  // Load a tile consisting of `tile_size_along_major_dim` vectors from position
-  // {major: `major_dim_offset`, minor: `minor_dim_offset`}.
-  //
-  // Note: `major_dim_offset` is a parameter to the constructor.
-  std::vector<llvm::Value*> LoadTile(llvm::Value* minor_dim_offset) const {
-    std::vector<llvm::Value*> result;
-    result.reserve(pointers_.size());
-    for (const auto& pointer : pointers_) {
-      result.push_back(vsl_->LoadVector(pointer, minor_dim_offset));
-    }
-    return result;
-  }
-
-  // Stores `tile` to position {major: `major_dim_offset`, minor:
-  // `minor_dim_offset`}.
-  //
-  // Note: `major_dim_offset` is a parameter to the constructor.
-  void StoreTile(absl::Span<llvm::Value* const> tile,
-                 llvm::Value* minor_dim_offset) const {
-    CHECK_EQ(tile.size(), pointers_.size());
-    for (int64 i = 0; i < pointers_.size(); i++) {
-      vsl_->StoreVector(tile[i], pointers_[i], minor_dim_offset);
-    }
-  }
+// Returns true if we should call into multi-threaded Eigen routines.
+bool ShouldUseMultiThreadedEigen(const HloModuleConfig& config) {
+  return config.debug_options().xla_cpu_multi_thread_eigen();
+}
 
-  // Loads a tile of size [`tile_size_along_major_dim`,
-  // `tile_size_along_middle_dim`] from position {major: `major_dim_offset`,
-  // minor: `minor_dim_offset`} and then broadcasts each element into a vector
-  // of size vsl_.vector_size().  The (i,j)'th element of the return value is
-  // the (i,j)'th element in the tile broadcasted into an LLVM vector.
-  //
-  // Note: `major_dim_offset` is a parameter to the constructor.
-  std::vector<std::vector<llvm::Value*>> LoadBroadcastTile(
-      llvm::Value* minor_dim_offset, int64 tile_size_along_middle_dim) const {
-    std::vector<std::vector<llvm::Value*>> result;
-    result.resize(pointers_.size());
-    for (int64 i = 0; i < pointers_.size(); i++) {
-      for (int64 j = 0; j < tile_size_along_middle_dim; j++) {
-        result[i].push_back(vsl_->LoadBroadcast(
-            pointers_[i], b_->CreateAdd(minor_dim_offset, b_->getInt64(j))));
-      }
-    }
-    return result;
+// Represents a dot operation.  We use this in lieu of an `HloInstruction`
+// because we want to be able to create this for the "inner" dot operation in a
+// batch dot, for which there is no separate HLO instruction.
+struct DotInfo {
+  Shape lhs_shape;
+  Shape rhs_shape;
+  Shape result_shape;
+  DotDimensionNumbers dim_nums;
+
+  explicit DotInfo(const HloInstruction& instr) {
+    CHECK_EQ(instr.opcode(), HloOpcode::kDot);
+    lhs_shape = instr.operand(0)->shape();
+    rhs_shape = instr.operand(1)->shape();
+    result_shape = instr.shape();
+    dim_nums = instr.dot_dimension_numbers();
   }
-
- private:
-  VectorSupportLibrary* vsl_;
-  llvm::IRBuilder<>* b_;
-  std::vector<llvm::Value*> pointers_;
 };
 
-// The base class for the classes representing the GEMV emitter configurations.
-//
-// The IR emitted (modulo the LLVM values representing the input and output
-// buffers) by the row major and column major GEMV emitters should be a function
-// of their configuration.  This is important because their configuration is
-// used as a key to cache the generated IR.
-class GemvConfig {
- public:
-  // Mixin for convenience.
-  template <typename T>
-  struct User {
-   public:
-    PrimitiveType scalar_type() const {
-      return derived().config().scalar_type();
-    }
-    int64 tile_rows() const { return derived().config().tile_rows(); }
-    int64 tile_cols() const { return derived().config().tile_cols(); }
-    int64 m() const { return derived().config().m(); }
-    int64 k() const { return derived().config().k(); }
-    int64 has_addend() const { return derived().config().has_addend(); }
-
-   private:
-    const T& derived() const { return *static_cast<const T*>(this); }
-  };
-
-  PrimitiveType scalar_type() const { return scalar_type_; }
-  int64 tile_rows() const { return tile_rows_; }
-  int64 tile_cols() const { return tile_cols_; }
-  int64 m() const { return m_; }
-  int64 k() const { return k_; }
-  bool has_addend() const { return has_addend_; }
-
-  string GetCacheKey() const {
-    return absl::StrCat(name_, "_", PrimitiveType_Name(scalar_type()), "_",
-                        tile_rows(), "_", tile_cols(), "_", m(), "_", k(),
-                        has_addend() ? "_with_addend" : "");
-  }
-
- protected:
-  explicit GemvConfig(string name, PrimitiveType scalar_type, int64 tile_rows,
-                      int64 tile_cols, int64 m, int64 k, bool has_addend)
-      : name_(std::move(name)),
-        scalar_type_(scalar_type),
-        tile_rows_(tile_rows),
-        tile_cols_(tile_cols),
-        m_(m),
-        k_(k),
-        has_addend_(has_addend) {}
-
- private:
-  string name_;
-  PrimitiveType scalar_type_;
-  int64 tile_rows_;
-  int64 tile_cols_;
-  int64 m_;
-  int64 k_;
-  bool has_addend_;
+// Dictates how a dot operation is implemented.
+enum class DotImplementationStrategy {
+  // The dot operation is lowered into LLVM IR that implements a naive nested
+  // loop that computes the result one element at a time.  This is our
+  // "fallback"; we don't really want this to kick in for any non-trival dot
+  // operation.
+  kNaiveLlvmIr,
+
+  // The dot operation is lowered into LLVM IR that implements a tiled
+  // Matrix*Vector operation.  This strategy also allows fusing in a bias add
+  // into the dot.  The matrix can be row major or column major, both are
+  // supported.
+  kTiledLlvmIrGemv,
+
+  // The dot operation is lowered into LLVM IR that implemetns a tiled
+  // Matrix*Matrix operation.  No fusions are supported.  The two inputs
+  // and the output have to be row major.
+  kTiledLlvmIrGemm,
+
+  // The dot operation is lowered into a call into an Eigen routine.  No fusions
+  // are supported today.  The two inputs and the output have to be row major.
+  // However, we do allow transposing either the LHS or the RHS as part of the
+  // GEMM -- we expose this flexibility as flexibility in the contraction
+  // dimensions, but we can also see this as flexibility in the input layouts.
+  kEigen,
 };
 
-// Computes a dot product between "[M,K]{0,1} lhs" with a [K,1] vector (the
-// layout of the vector does not matter).  This implementation uses a tiling
-// scheme to improve performance.
-//
-// We logically separate the LHS matrix into four segments:
-//
-//   +----------------------+---+
-//   |                      |   |
-//   |                      |   |
-//   |         A            | B |
-//   |                      |   |
-//   |                      |   |
-//   |                      |   |
-//   +----------------------+---+
-//   |         C            | D |
-//   +----------------------+---+
-//
-// where A is the largest submatrix of the LHS that can be evenly dividied into
-// tiles.  For each tile in A, assuming tile_rows_ == tile_cols_ == 4, we have:
-//
-//   +---+---+---+---+       +--+--+--+--+
-//   |M00|M10|M20|M30|       |V0|V1|V2|V3|
-//   +---+---+---+---+       +--+--+--+--+
-//   |M01|M11|M21|M31| and   |V0|V1|V2|V3|
-//   +---+---+---+---+       +--+--+--+--+
-//   |M02|M12|M22|M32|       |V0|V1|V2|V3|
-//   +---+---+---+---+       +--+--+--+--+
-//   |M03|M13|M23|M33|       |V0|V1|V2|V3|
-//   +---+---+---+---+       +--+--+--+--+
-//
-// (Legend: rows are horizontal and columns are vertical; and each column is one
-// llvm::Value of a vector type)
-//
-// where:
-//
-//   a. The left tile is from the column major left matrix.
-//   b. The right tile is an elementwise broadcast of a [V0, V1, V2, V3]
-//      vector loaded from the RHS vector.
-//
-// As we iterate through the column dimension, we compute the change to the
-// result vector by an elementwise multiplication between the two tiles above
-// followed by a reduction along the major dimension:
-//
-//                     +-----------------------------------+
-//                     | M00*V0 + M10*V1 + M20*V2 + M30*V3 |
-//                     +-----------------------------------+
-//                     | M01*V0 + M11*V1 + M21*V2 + M31*V3 |
-// Result[R:R+4] +=    +-----------------------------------+
-//                     | M02*V0 + M12*V1 + M22*V2 + M32*V3 |
-//                     +-----------------------------------+
-//                     | M03*V0 + M13*V1 + M23*V2 + M33*V3 |
-//                     +-----------------------------------+
-//
-// Where R is the starting row for the tile.
-//
-// We have an inner epilogue loop to deal with the "C" submatrix and an outer
-// epilogue loop to deal with the B,D submarix.
-//
-// TODO(sanjoy): We should investigate if using gather loads and scatter stores
-// can be used here have the same inner loop for both column-major and row-major
-// matrix-vector products.
-class ColumnMajorMatrixVectorProductEmitter
-    : public GemvConfig::User<ColumnMajorMatrixVectorProductEmitter> {
- public:
-  class Config : public GemvConfig {
-   public:
-    explicit Config(PrimitiveType scalar_type, int64 tile_rows, int64 tile_cols,
-                    int64 m, int64 k, bool has_addend)
-        : GemvConfig(/*name=*/"col_major_gemv", scalar_type,
-                     /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols, /*m=*/m,
-                     /*k=*/k, /*has_addend=*/has_addend) {}
-  };
-
-  ColumnMajorMatrixVectorProductEmitter(const Config& config, llvm::Value* lhs,
-                                        llvm::Value* rhs, llvm::Value* addend,
-                                        llvm::Value* result,
-                                        llvm::IRBuilder<>* b)
-      : config_(config),
-        lhs_(lhs),
-        rhs_(rhs),
-        addend_(addend),
-        result_(result),
-        b_(b),
-        ksl_(b_),
-        vsl_(config.scalar_type(), /*vector_size=*/config.tile_rows(), b_, "") {
-    CHECK(tile_rows() > 0 && IsPowerOfTwo(static_cast<uint64>(tile_rows())));
-    CHECK(!has_addend() || addend != nullptr);
-  }
-
-  void Emit();
-
-  const Config& config() const { return config_; }
-
- private:
-  void EmitOuterLoopBody(llvm::Value* column, int64 column_count,
-                         bool is_first_column);
-
-  MemoryTile GetLhsMemoryTile(llvm::Value* column_start, int64 column_count) {
-    return MemoryTile(&vsl_, b_, /*matrix=*/lhs_,
-                      /*matrix_size_along_minor_dim=*/m(),
-                      /*major_dim_offset=*/column_start,
-                      /*tile_size_along_major_dim=*/column_count);
-  }
+// Returns the implementation strategy for a dot with the configuration
+// `dot_info`.
+DotImplementationStrategy GetDotImplementationStrategy(
+    const HloModuleConfig& config, const DotInfo& dot_info,
+    const TargetMachineFeatures& target_machine_features);
 
-  // Load a tile of values from the RHS.  For the RHS a "tile" is a contiguous
-  // sequence of `count` values, each one broadcasted to the vector width.
-  std::vector<llvm::Value*> LoadRhsTile(llvm::Value* offset, int64 count) {
-    llvm::Value* base_pointer = vsl_.ComputeOffsetPointer(rhs_, offset);
-    std::vector<llvm::Value*> result;
-    result.reserve(count);
-    for (int64 i = 0; i < count; i++) {
-      result.push_back(vsl_.LoadBroadcast(base_pointer, i));
-    }
-    return result;
-  }
-
-  void EmitInnerLoopTiled(MemoryTile* lhs_memory_tile,
-                          const std::vector<llvm::Value*>& rhs_tile,
-                          int64 columns, bool is_first_column);
-
-  void EmitInnerLoopEpilogue(llvm::Value* current_tile_col, int64 columns,
-                             bool is_first_tiled_column);
-
-  Config config_;
-  llvm::Value* lhs_;
-  llvm::Value* rhs_;
-  llvm::Value* addend_;
-  llvm::Value* result_;
-  llvm::IRBuilder<>* b_;
-  KernelSupportLibrary ksl_;
-  VectorSupportLibrary vsl_;
-};
-
-void ColumnMajorMatrixVectorProductEmitter::EmitOuterLoopBody(
-    llvm::Value* column, int64 column_count, bool is_first_column) {
-  MemoryTile lhs_memory_tile = GetLhsMemoryTile(/*column_start=*/column,
-                                                /*column_count=*/column_count);
-
-  std::vector<llvm::Value*> rhs_tile =
-      LoadRhsTile(column, /*count=*/column_count);
-  EmitInnerLoopTiled(&lhs_memory_tile, rhs_tile,
-                     /*columns=*/column_count, is_first_column);
-  EmitInnerLoopEpilogue(column, /*columns=*/column_count, is_first_column);
-}
-
-void ColumnMajorMatrixVectorProductEmitter::Emit() {
-  // See the comment on the class declaration for the algorithm used here.
-  int64 column_remainder = k() % tile_cols();
-  int64 column_limit = k() - column_remainder;
-
-  ksl_.For("dot.outer.tiled",
-           /*start=*/0, /*end=*/column_limit, /*step=*/tile_cols(),
-           [&](llvm::Value* column, bool is_first_column) {
-             EmitOuterLoopBody(column, tile_cols(), is_first_column);
-           });
-
-  if (column_remainder != 0) {
-    EmitOuterLoopBody(b_->getInt64(column_limit), column_remainder,
-                      column_limit == 0);
-  }
-}
-
-void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
-    MemoryTile* lhs_memory_tile, const std::vector<llvm::Value*>& rhs_tile,
-    int64 columns, bool is_first_column) {
-  int64 row_limit = m() - (m() % tile_rows());
-
-  ksl_.For(
-      "dot.inner.tiled", /*start=*/0, /*end=*/row_limit,
-      /*step=*/tile_rows(), [&](llvm::Value* row) {
-        std::vector<llvm::Value*> lhs_tile =
-            lhs_memory_tile->LoadTile(/*minor_dim_offset=*/row);
-        llvm::Value* accumulator =
-            is_first_column ? (addend_ ? vsl_.LoadVector(addend_, row)
-                                       : vsl_.GetZeroVector())
-                            : vsl_.LoadVector(result_, row);
-        for (int i = 0; i < columns; i++) {
-          accumulator = vsl_.MulAdd(lhs_tile[i], rhs_tile[i], accumulator);
-        }
-        vsl_.StoreVector(accumulator, result_, row);
-      });
-}
-
-void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
-    llvm::Value* current_tile_col, int64 columns, bool is_first_tiled_column) {
-  int64 row_start = m() - (m() % tile_rows());
-  if (row_start == m()) {
-    return;
-  }
-
-  llvm::Value* columns_llvm = b_->getInt64(columns);
-
-  // for (col = current_tile_col; col < (columns + current_tile_col); col++)
-  //   for (row = row_start, row < m_; row++) {
-  //     result[row] += lhs[row, col] * rhs[col]
-  //     // Also take into account that if col is 0 then result[row] is not
-  //     // initialized.
-  //   }
-
-  ksl_.For(
-      "dot.inner.epilg.outer", /*start=*/current_tile_col,
-      /*end=*/b_->CreateAdd(columns_llvm, current_tile_col),
-      /*step=*/1, /*peel_first_iteration=*/false,
-      [&](llvm::Value* col, llvm::Value* is_first_scalar_col) {
-        llvm::Value* rhs_element = vsl_.LoadScalar(rhs_, col);
-        llvm::Value* total_offset = b_->CreateMul(col, b_->getInt64(m()));
-        llvm::Value* lhs_base_pointer =
-            vsl_.ComputeOffsetPointer(lhs_, total_offset);
-        ksl_.For(
-            "dot.inner.epilg.inner", /*start=*/row_start, /*end=*/m(),
-            /*step=*/1, [&](llvm::Value* scalar_row) {
-              llvm::Value* product = vsl_.Mul(
-                  vsl_.LoadScalar(lhs_base_pointer, scalar_row), rhs_element);
-              llvm::Value* setting_result_first_time = b_->CreateAnd(
-                  is_first_scalar_col, b_->getInt1(is_first_tiled_column));
-              ksl_.If(
-                  setting_result_first_time,
-                  /*true_block_generator=*/
-                  [&]() {
-                    if (addend_) {
-                      vsl_.StoreScalar(
-                          vsl_.Add(vsl_.LoadScalar(addend_, scalar_row),
-                                   product),
-                          result_, scalar_row);
-                    } else {
-                      vsl_.StoreScalar(product, result_, scalar_row);
-                    }
-                  },
-                  /*false_block_generator=*/
-                  [&]() {
-                    vsl_.StoreScalar(
-                        vsl_.Add(vsl_.LoadScalar(result_, scalar_row), product),
-                        result_, scalar_row);
-                  });
-            });
-      });
-}
-
-// Computes a dot product between "[M,K]{1,0} lhs" with a [K,1] vector (the
-// layout of the vector does not matter).  This implementation uses a tiling
-// scheme to improve performance.
-//
-// We logically separate the LHS matrix into four segments:
-//
-//   +----------------------+---+
-//   |                      |   |
-//   |                      |   |
-//   |         A            | B |
-//   |                      |   |
-//   |                      |   |
-//   |                      |   |
-//   +----------------------+---+
-//   |         C            | D |
-//   +----------------------+---+
-//
-// where A is the largest submatrix of the LHS that can be evenly dividied into
-// tiles.  For each tile in A, assuming tile_rows_ == tile_cols_ == 4, we have:
-//
-//   +---+---+---+---+
-//   |M00|M10|M20|M30|
-//   +---+---+---+---+       +--+--+--+--+
-//   |M01|M11|M21|M31| and   |V0|V1|V2|V3|
-//   +---+---+---+---+       +--+--+--+--+
-//   |M02|M12|M22|M32|
-//   +---+---+---+---+
-//   |M03|M13|M23|M33|
-//   +---+---+---+---+
-//
-// (Legend: rows are horizontal and columns are vertical; and each row is one
-// llvm::Value of a vector type)
-//
-// where:
-//
-//   a. The left tile is loaded from the row major left matrix.
-//   b. The right vector is loaded from the RHS vector.
-//
-// We keep 4 vector accumulators accumulating the following four vector
-// expressions as we iterate over the row dimension:
-//
-//   +------+------+------+------+
-//   |M0I*V0|M1I*V1|M2I*V2|M3I*V3|  for I in [0,4)
-//   +------+------+------+------+
-//
-// In the end we do a horizontal reduction over these 4 vector accumulators to
-// get 4 values in the result vector.
-//
-// We have an inner epilogue loop to deal with the "B" sub-matrix and an outer
-// epilogue loop to deal with the C,D submatrix.
-class RowMajorMatrixVectorProductEmitter
-    : public GemvConfig::User<RowMajorMatrixVectorProductEmitter> {
+// Helper class for emitting LLVM IR to perform the dot operation.
+class DotOpEmitter {
  public:
-  class Config : public GemvConfig {
-   public:
-    explicit Config(PrimitiveType scalar_type, int64 tile_rows, int64 tile_cols,
-                    int64 m, int64 k, bool has_addend)
-        : GemvConfig(/*name=*/"row_major_gemv", scalar_type,
-                     /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols, /*m=*/m,
-                     /*k=*/k, /*has_addend=*/has_addend) {}
-  };
-
-  RowMajorMatrixVectorProductEmitter(const Config& config, llvm::Value* lhs,
-                                     llvm::Value* rhs, llvm::Value* addend,
-                                     llvm::Value* result, llvm::IRBuilder<>* b)
-      : config_(config),
-        lhs_(lhs),
-        rhs_(rhs),
-        addend_(addend),
-        result_(result),
-        b_(b),
-        ksl_(b_),
-        vsl_(scalar_type(), /*vector_size=*/tile_cols(), b_, "") {
-    CHECK(tile_cols() > 0 && IsPowerOfTwo(static_cast<uint64>(tile_cols())));
-    CHECK(!has_addend() || addend != nullptr);
-  }
-
-  void Emit();
-
-  const Config& config() const { return config_; }
+  explicit DotOpEmitter(DotInfo dot_info, string dot_hlo_name,
+                        const llvm_ir::IrArray& target_array,
+                        const llvm_ir::IrArray& lhs_array,
+                        const llvm_ir::IrArray& rhs_array,
+                        const llvm_ir::IrArray* addend_array,
+                        llvm::Value* executable_run_options_value,
+                        llvm::IRBuilder<>* b,
+                        const HloModuleConfig& hlo_module_config,
+                        const TargetMachineFeatures& target_machine_features);
+
+  // Emits the IR to perform the dot operation.
+  Status Emit();
 
  private:
-  MemoryTile GetLhsMemoryTile(llvm::Value* row_start, int64 row_count) {
-    return MemoryTile(&vsl_, b_, /*matrix=*/lhs_,
-                      /*matrix_size_along_minor_dim=*/k(),
-                      /*major_dim_offset=*/row_start,
-                      /*tile_size_along_major_dim=*/row_count);
-  }
-
-  void EmitOuterLoopBody(llvm::Value* row, int64 row_count);
-
-  void EmitInnerLoopTiled(MemoryTile* lhs_memory_tile, int64 rows,
-                          std::vector<VectorVariable>* vector_accumulators);
-
-  void EmitInnerLoopEpilogue(llvm::Value* current_tile_row, int64 rows,
-                             std::vector<ScalarVariable>* scalar_accumulators);
-
-  Config config_;
-  llvm::Value* lhs_;
-  llvm::Value* rhs_;
-  llvm::Value* addend_;
-  llvm::Value* result_;
-  llvm::IRBuilder<>* b_;
-  KernelSupportLibrary ksl_;
-  VectorSupportLibrary vsl_;
-};
+  // Emits instructions to perform a scalar dot product (a multiply of the
+  // LHS and RHS) and store the results in the target.
+  Status EmitScalarDot();
 
-void RowMajorMatrixVectorProductEmitter::EmitOuterLoopBody(llvm::Value* row,
-                                                           int64 row_count) {
-  MemoryTile lhs_memory_tile = GetLhsMemoryTile(/*row_start=*/row,
-                                                /*row_count=*/row_count);
-  std::vector<VectorVariable> vector_accumulators;
-  std::vector<ScalarVariable> scalar_accumulators;
-  for (int i = 0; i < row_count; i++) {
-    vector_accumulators.emplace_back(&vsl_, vsl_.GetZeroVector());
-    scalar_accumulators.emplace_back(&vsl_, vsl_.GetZeroScalar());
-  }
-  EmitInnerLoopTiled(&lhs_memory_tile, /*rows=*/row_count,
-                     &vector_accumulators);
-  EmitInnerLoopEpilogue(/*current_tile_row=*/row, /*rows=*/row_count,
-                        &scalar_accumulators);
-
-  std::vector<llvm::Value*> accumulator_values;
-  std::transform(
-      vector_accumulators.begin(), vector_accumulators.end(),
-      std::back_inserter(accumulator_values),
-      [](const VectorVariable& vector_var) { return vector_var.Get(); });
-
-  std::vector<llvm::Value*> horizontal_sums;
-  if (row_count == vsl_.vector_size()) {
-    if (addend_) {
-      horizontal_sums = vsl_.ComputeHorizontalSums(
-          std::move(accumulator_values), vsl_.LoadVector(addend_, row));
-    } else {
-      horizontal_sums =
-          vsl_.ComputeHorizontalSums(std::move(accumulator_values));
-    }
-  } else {
-    horizontal_sums = vsl_.ComputeHorizontalSums(std::move(accumulator_values));
-  }
-
-  for (int i = 0; i < row_count; i++) {
-    llvm::Value* result_value =
-        vsl_.Add(horizontal_sums[i], scalar_accumulators[i].Get());
-    llvm::Value* offset = b_->CreateAdd(b_->getInt64(i), row);
-    if (addend_ && row_count != vsl_.vector_size()) {
-      result_value = vsl_.Add(vsl_.LoadScalar(addend_, offset), result_value);
-    }
-    vsl_.StoreScalar(result_value, result_, offset);
-  }
-}
+  // Emits a call to the CPU runtime to perform the matrix multiply.
+  Status EmitCallToRuntime();
 
-void RowMajorMatrixVectorProductEmitter::Emit() {
-  // See the comment on the class declaration for the algorithm used here.
-  int64 row_remainder = m() % tile_rows();
-  int64 row_limit = m() - row_remainder;
+  // Represents the dimensions of a matrix-matrix multiply operation.
+  struct MatMultDims {
+    // The number of rows in the LHS.
+    int64 m;
 
-  ksl_.For("dot.outer.tiled",
-           /*start=*/0, /*end=*/row_limit, /*step=*/tile_rows(),
-           [&](llvm::Value* row) { EmitOuterLoopBody(row, tile_rows()); });
-
-  if (row_remainder != 0) {
-    EmitOuterLoopBody(b_->getInt64(row_limit), row_remainder);
-  }
-}
-
-void RowMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
-    MemoryTile* lhs_memory_tile, int64 rows,
-    std::vector<VectorVariable>* vector_accumulators) {
-  int64 column_limit = k() - (k() % tile_cols());
-
-  ksl_.For("dot.inner.tiled", /*start=*/0, /*end=*/column_limit,
-           /*step=*/tile_cols(), [&](llvm::Value* col) {
-             std::vector<llvm::Value*> lhs_tile =
-                 lhs_memory_tile->LoadTile(/*minor_dim_offset=*/col);
-             llvm::Value* rhs_value = vsl_.LoadVector(rhs_, col);
-             for (int i = 0; i < rows; i++) {
-               llvm::Value* old_sum = (*vector_accumulators)[i].Get();
-               (*vector_accumulators)[i].Set(
-                   vsl_.Add(old_sum, vsl_.Mul(rhs_value, lhs_tile[i])));
-             }
-           });
-}
-
-void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
-    llvm::Value* current_tile_row, int64 rows,
-    std::vector<ScalarVariable>* scalar_accumulators) {
-  int64 column_start = k() - (k() % tile_cols());
-  if (column_start == k()) {
-    return;
-  }
-
-  for (int r = 0; r < rows; r++) {
-    llvm::Value* total_offset = b_->CreateMul(
-        b_->CreateAdd(b_->getInt64(r), current_tile_row), b_->getInt64(k()));
-    llvm::Value* lhs_base_pointer =
-        vsl_.ComputeOffsetPointer(lhs_, total_offset);
-    ksl_.For(
-        "dot.inner.epilg.inner", /*start=*/column_start, /*end=*/k(),
-        /*step=*/1, [&](llvm::Value* scalar_col) {
-          llvm::Value* product =
-              vsl_.Mul(vsl_.LoadScalar(lhs_base_pointer, scalar_col),
-                       vsl_.LoadScalar(rhs_, scalar_col));
-          llvm::Value* old_value = (*scalar_accumulators)[r].Get();
-          (*scalar_accumulators)[r].Set(vsl_.Add(old_value, product));
-        });
-  }
-}
+    // The number of columns in the LHS, which is also must be equal to the
+    // number of rows in the RHS.
+    int64 k;
 
-// This class implements a tiled matrix multiplication algorithm, intended for
-// multiplying small matrices that don't need cache tiling.
-//
-// In the future this can be used as the innermost GEBP loop in a GEMM kernel as
-// described in "Goto, Kazushige, and Robert A. Geijn. "Anatomy of
-// high-performance matrix multiplication." ACM Transactions on Mathematical
-// Software (TOMS) 34.3 (2008): 12.".
-//
-// This only supports canonical dot operations (i.e. where the lhs contraction
-// dimension is 1 and the rhs contraction dimension is 0) over row major
-// matrices.
-class TiledSmallGemmEmitter {
- public:
-  // Describe the dimensions of the kernel.
-  class Dimensions {
-   public:
-    explicit Dimensions(int64 m, int64 k, int64 n) : m_(m), k_(k), n_(n) {}
+    // The number of columns on the RHS.
+    int64 n;
 
-    int64 m() const { return m_; }
-    int64 k() const { return k_; }
-    int64 n() const { return n_; }
+    // True if the LHS matrix is column major.
+    bool lhs_column_major;
 
-    string ToString() const { return absl::StrCat(m(), "x", k(), "x", n()); }
+    // True if the LHS contraction dimension is not 1.
+    bool lhs_non_canonical;
 
-   private:
-    const int64 m_;
-    const int64 k_;
-    const int64 n_;
-  };
+    // True if the RHS matrix is column major.
+    bool rhs_column_major;
 
-  // Represents the configuration of the emitter.  The LLVM IR emitted by the
-  // emitter, modulo the LLVM values holding the input and output buffers, must
-  // be a function of the instance of `Config` passed to it.
-  //
-  // `dims` holds the matrix multiplication dimensions.
-  //
-  // `max_vectorization_width` is the maximum vector width (i.e. the width of
-  // the largest vector register we will use).  This can be larger than the
-  // largest vector register supported by the machine -- LLVM will legalize
-  // these large vector widths into legally sized vectors.
-  //
-  // `max_vector_count` is the maximum number of vectors of size
-  // `max_vectorization_width` that we will attempt to process at once.
-  //
-  // `min_vectorization_width` is the smallest vector width the emitter will use
-  // -- below that it will devolve to using a scalar loop.
-  //
-  // The innermost reduction loop executes the matrix multiply in tiles of size
-  // [`tile_size_m`, `tile_size_k`] from the LHS and [`tile_size_k`,
-  // <vectorization width>] in the RHS.
-  class Config {
-   public:
-    explicit Config(PrimitiveType scalar_type, Dimensions dims,
-                    int64 max_vectorization_width, int64 max_vector_count,
-                    int64 min_vectorization_width, int64 tile_size_m,
-                    int64 tile_size_k)
-        : scalar_type_(scalar_type),
-          dims_(dims),
-          max_vectorization_width_(max_vectorization_width),
-          max_vector_count_(max_vector_count),
-          min_vectorization_width_(min_vectorization_width),
-          tile_size_m_(tile_size_m),
-          tile_size_k_(tile_size_k) {}
-
-    string GetCacheKey() const {
-      return absl::StrCat("gemm_", PrimitiveType_Name(scalar_type()), "_",
-                          dims().ToString(), "_", max_vectorization_width(),
-                          "_", min_vectorization_width(), "_", tile_size_m(),
-                          "_", tile_size_k());
-    }
+    // True if the RHS contraction dimension is not 0.
+    bool rhs_non_canonical;
 
-    PrimitiveType scalar_type() const { return scalar_type_; }
-    Dimensions dims() const { return dims_; }
-    int64 max_vectorization_width() const { return max_vectorization_width_; }
-    int64 max_vector_count() const { return max_vector_count_; }
-    int64 min_vectorization_width() const { return min_vectorization_width_; }
-
-    int64 tile_size_m() const { return tile_size_m_; }
-    int64 tile_size_k() const { return tile_size_k_; }
-
-   private:
-    PrimitiveType scalar_type_;
-    Dimensions dims_;
-    int64 max_vectorization_width_;
-    int64 max_vector_count_;
-    int64 min_vectorization_width_;
-    int64 tile_size_m_;
-    int64 tile_size_k_;
+    // True if the result matrix is column major.
+    bool target_column_major;
   };
 
-  // Creates an instance of TiledSmallGemmEmitter that matrix-multiplies
-  // `lhs` with `rhs` and stores the result in `result`.
-  explicit TiledSmallGemmEmitter(Config config, llvm::Value* lhs,
-                                 llvm::Value* rhs, llvm::Value* result,
-                                 llvm::IRBuilder<>* b)
-      : lhs_(lhs),
-        rhs_(rhs),
-        result_(result),
-        config_(config),
-        b_(b),
-        ksl_(b_) {
-    CHECK(max_vectorization_width() > 0 &&
-          IsPowerOfTwo(static_cast<uint64>(max_vectorization_width())));
-    CHECK_GT(max_vector_count(), 0);
-    CHECK(min_vectorization_width() > 0 &&
-          IsPowerOfTwo(static_cast<uint64>(min_vectorization_width())));
-    CHECK_GE(max_vectorization_width(), min_vectorization_width());
-    CHECK_GT(tile_size_k(), 0);
-  }
-
-  void Emit();
-
- private:
-  // The HandleResiduesOnX helpers split the iteration space for dimension X
-  // into a multiple of the tile size on dimension X and an epilogue.  These
-  // helpers ultimately call into `EmitTiledGemm` for emitting the
-  // tiled GEMM kernel.
-
-  void HandleResiduesOnN();
-  void HandleResiduesOnK(VectorSupportLibrary* vsl, llvm::Value* n_start,
-                         llvm::Value* n_end);
-  void HandleResiduesOnM(VectorSupportLibrary* vsl, int64 tile_size_k,
-                         llvm::Value* k_start, llvm::Value* k_end,
-                         llvm::Value* n_start, llvm::Value* n_end);
-
-  // This emits a tiled GEMM kernel.  For a detailed description see the comment
-  // on the implementation.
-  void EmitTiledGemm(VectorSupportLibrary* vsl, int64 tile_size_k,
-                     llvm::Value* k_start, llvm::Value* k_end,
-                     llvm::Value* n_start, llvm::Value* n_end,
-                     int64 tile_size_m, llvm::Value* m_start,
-                     llvm::Value* m_end);
-
-  llvm::Value* GetInt64(int64 value) { return b_->getInt64(value); }
-
-  Config config() const { return config_; }
-  Dimensions dims() const { return config().dims(); }
-
-  int64 max_vectorization_width() const {
-    return config().max_vectorization_width();
-  }
-  int64 max_vector_count() const { return config().max_vector_count(); }
-  int64 min_vectorization_width() const {
-    return config().min_vectorization_width();
-  }
-  int64 tile_size_m() const { return config().tile_size_m(); }
-  int64 tile_size_k() const { return config().tile_size_k(); }
-  PrimitiveType scalar_type() const { return config().scalar_type(); }
-
-  llvm::Value* lhs_;
-  llvm::Value* rhs_;
-  llvm::Value* result_;
-  Config config_;
-
+  // Get the MatMultDims instance for the dot product this DotOpEmitter
+  // represents.  Precondition: the dot is of rank 2 (and thus its operands are
+  // of rank 2 as well).
+  MatMultDims GetMatMultDims() const;
+
+  // Lowers the dot operation as a tiled Matrix*Vector loop.
+  void EmitTiledLlvmIrGemv();
+
+  // Lowers the dot operation as a tiled Matrix*Matrix loop.
+  void EmitTiledLlvmIrGemm();
+
+  // Lowers the dot operation as a naive nested loop that computes the result
+  // one element at a time.
+  void EmitNaiveLlvmIrGemm();
+
+  // When doing a tiled GEMV in LLVM IR, a "tile" consists of this many vector
+  // registers.
+  int64 GetGemvTilingFactor() const {
+    const int64 kDefaultTilingFactor = 8;
+    return options::LlvmIrGemvTilingFactor(hlo_module_config_)
+        .value_or(kDefaultTilingFactor);
+  }
+
+  std::tuple<int64, int64, int64> GetGemmTileSize() const {
+    // Tuned for broadwell - Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
+    //
+    // TODO(b/80093688): Tune for other architectures and centralize this
+    // information in one place.
+    const std::tuple<int64, int64, int64> kDefaultTileSize =
+        std::tuple<int64, int64, int64>(11, 9, 1);
+    return options::LlvmIrGemmTileSize(hlo_module_config_)
+        .value_or(kDefaultTileSize);
+  }
+
+  DotInfo dot_info_;
+  string dot_hlo_name_;
+  const llvm_ir::IrArray& target_array_;
+  const llvm_ir::IrArray& lhs_array_;
+  const llvm_ir::IrArray& rhs_array_;
+  const llvm_ir::IrArray* addend_array_;
+  llvm::Value* executable_run_options_value_;
   llvm::IRBuilder<>* b_;
-  KernelSupportLibrary ksl_;
+  const HloModuleConfig& hlo_module_config_;
+  const TargetMachineFeatures& target_machine_features_;
 };
-
-void TiledSmallGemmEmitter::Emit() { HandleResiduesOnN(); }
-
-void TiledSmallGemmEmitter::HandleResiduesOnN() {
-  // We can only iterate the `n` dimension for an extent that is divisible by
-  // the vectorization width.  So we emit an outer loop that first processes the
-  // largest extent in `n` that is divisible by max_vectorization_width, then
-  // the largest remaining extent that is divisible by max_vectorization_width /
-  // 2 etc.
-
-  int64 current_vectorization_width =
-      max_vector_count() * max_vectorization_width();
-  int64 current_vector_count = max_vector_count();
-
-  int64 n_start = 0;
-  while (n_start != dims().n() &&
-         current_vectorization_width >= min_vectorization_width()) {
-    int64 n_end = dims().n() - (dims().n() % current_vectorization_width);
-    if (n_start != n_end) {
-      VectorSupportLibrary vsl(scalar_type(), current_vectorization_width, b_,
-                               "gemm");
-      HandleResiduesOnK(&vsl, GetInt64(n_start), GetInt64(n_end));
-      n_start = n_end;
-    }
-    if (current_vector_count == 1) {
-      current_vectorization_width /= 2;
-    } else {
-      current_vector_count--;
-      current_vectorization_width =
-          current_vector_count * max_vectorization_width();
-    }
-  }
-
-  if (n_start != dims().n()) {
-    VectorSupportLibrary vsl(scalar_type(), 1, b_, "gemm");
-    ksl_.For("epi.n", n_start, dims().n(), 1, [&](llvm::Value* n_i) {
-      llvm::Value* n_i_next = b_->CreateAdd(n_i, b_->getInt64(1));
-      HandleResiduesOnK(&vsl, n_i, n_i_next);
-    });
-  }
-}
-
-void TiledSmallGemmEmitter::HandleResiduesOnK(VectorSupportLibrary* vsl,
-                                              llvm::Value* n_start,
-                                              llvm::Value* n_end) {
-  int64 k_start = 0;
-  int64 k_end = dims().k() - (dims().k() % tile_size_k());
-  if (k_end != k_start) {
-    HandleResiduesOnM(vsl, tile_size_k(), GetInt64(k_start), GetInt64(k_end),
-                      n_start, n_end);
-    k_start = k_end;
-  }
-
-  if (k_start != dims().k()) {
-    HandleResiduesOnM(vsl, dims().k() - k_start, GetInt64(k_start),
-                      GetInt64(dims().k()), n_start, n_end);
-  }
-}
-
-void TiledSmallGemmEmitter::HandleResiduesOnM(
-    VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start,
-    llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end) {
-  const int64 m_end = dims().m() - dims().m() % tile_size_m();
-  EmitTiledGemm(vsl, tile_size_k, k_start, k_end, n_start, n_end, tile_size_m(),
-                GetInt64(0), GetInt64(m_end));
-
-  if (m_end != dims().m()) {
-    EmitTiledGemm(vsl, tile_size_k, k_start, k_end, n_start, n_end,
-                  dims().m() - m_end, GetInt64(m_end), GetInt64(dims().m()));
-  }
-}
-
-// The loop structure is:
-//
-// Iterate over dimension M as m:
-//   Iterate over dimension N as n:
-//     Iterate over dimension K as k:
-//       OutputTile[m,n] += Dot(LhsTile[m,k], RhsTile[k,n])
-//
-// I.e. a just a tiled version of a "naive" GEMM.
-//
-// The tiling scheme is as follows:
-//
-// Let the LHS be:
-//
-//   +----+----+----+
-//   | a0 | b0 | c0 | .
-//   +----+----+----+ .
-//   | a1 | b1 | c1 | .
-//   +----+----+----+
-//     ..     ..
-//
-// and the RHS be:
-//
-//   +----+----+----+----+
-//   | p0 | p1 | p2 | p3 | .
-//   +----+----+----+----+ .
-//   | q0 | q1 | q2 | q3 | .
-//   +----+----+----+----+
-//   | r0 | r1 | r2 | r3 | .
-//   +----+----+----+----+ .
-//     ......    ......
-//
-// and let tile_size_m=2, tile_size_k=3 and the vector width (implicitly denoted
-// by `vsl`) be 4.  Then we want to matrix multiply this tile to get a [2,4]
-// matrix that we can increment the result matrix by.
-//
-// First broadcast the rows row in LHS to 3 vectors of width 4, giving us a rank
-// 3 array, L, of dimension [2,3,4]:
-//
-//       L[0,_,_]           *      L[1,_,_]
-//                          *
-//   +----+----+----+----+  *  +----+----+----+----+
-//   | a0 | a0 | a0 | a0 |  *  | a1 | a1 | a1 | a1 |
-//   +----+----+----+----+  *  +----+----+----+----+
-//   | b0 | b0 | b0 | b0 |  *  | b1 | b1 | b1 | b1 |
-//   +----+----+----+----+  *  +----+----+----+----+
-//   | c0 | c0 | c0 | c0 |  *  | c1 | c1 | c1 | c1 |
-//   +----+----+----+----+  *  +----+----+----+----+
-//
-//
-// Then we FMA L[0,_,_] with the RHS to get the first row of the result and
-// L[1,_,_] with the RHS to get the second row of the result.  For example,
-// L[0,_,_] is computed as:
-//
-//   +----+----+----+----+   +----+----+----+----+
-//   | a0 | a0 | a0 | a0 | * | p0 | p1 | p2 | p3 |   +
-//   +----+----+----+----+   +----+----+----+----+
-//
-//   +----+----+----+----+   +----+----+----+----+
-//   | b0 | b0 | b0 | b0 | * | q0 | q1 | q2 | q3 |   +
-//   +----+----+----+----+   +----+----+----+----+
-//
-//   +----+----+----+----+   +----+----+----+----+
-//   | c0 | c0 | c0 | c0 | * | r0 | r1 | r2 | r3 |
-//   +----+----+----+----+   +----+----+----+----+
-//
-// to get:
-//
-//   +-------------------+-------------------+-------------------+---------
-//   | a0*p0+b0*q0+c0*r0 | a0*p1+b0*q1+c0*r1 | a0*p2+b0*q2+c0*r2 |  ...
-//   +-------------------+-------------------+-------------------+---------
-void TiledSmallGemmEmitter::EmitTiledGemm(
-    VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start,
-    llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end,
-    int64 tile_size_m, llvm::Value* m_start, llvm::Value* m_end) {
-  ksl_.For(
-      "dot.m", m_start, m_end, tile_size_m, [&](llvm::Value* m_i) {
-        MemoryTile result_memory_tile(
-            vsl, b_, /*matrix=*/result_,
-            /*matrix_size_along_minor_dim=*/dims().n(),
-            /*major_dim_offset=*/m_i,
-            /*tile_size_along_major_dim=*/tile_size_m);
-        MemoryTile lhs_memory_tile(vsl, b_, /*matrix=*/lhs_,
-                                   /*matrix_size_along_minor_dim=*/dims().k(),
-                                   /*major_dim_offset=*/m_i,
-                                   /*tile_size_along_major_dim=*/tile_size_m);
-        ksl_.For(
-            "dot.n", n_start, n_end, vsl->vector_size(), [&](llvm::Value* n_i) {
-              TileVariable result_tile_var(vsl,
-                                           result_memory_tile.LoadTile(n_i));
-              ksl_.For(
-                  "dot.k", k_start, k_end, tile_size_k, [&](llvm::Value* k_i) {
-                    MemoryTile rhs_memory_tile(vsl, b_, rhs_, dims().n(), k_i,
-                                               tile_size_k);
-                    std::vector<std::vector<llvm::Value*>> lhs_tile =
-                        lhs_memory_tile.LoadBroadcastTile(k_i, tile_size_k);
-                    std::vector<llvm::Value*> rhs_tile =
-                        rhs_memory_tile.LoadTile(n_i);
-                    std::vector<llvm::Value*> result_tile =
-                        result_tile_var.Get();
-                    for (int64 r_m_i = 0; r_m_i < tile_size_m; r_m_i++) {
-                      for (int64 r_k_i = 0; r_k_i < tile_size_k; r_k_i++) {
-                        result_tile[r_m_i] =
-                            vsl->MulAdd(lhs_tile[r_m_i][r_k_i], rhs_tile[r_k_i],
-                                        result_tile[r_m_i]);
-                      }
-                    }
-                    result_tile_var.Set(result_tile);
-                  });
-
-              result_memory_tile.StoreTile(result_tile_var.Get(), n_i);
-            });
-      });
-}
-
 }  // namespace
 
-DotOpEmitter::DotOpEmitter(const HloInstruction& dot,
+DotOpEmitter::DotOpEmitter(DotInfo dot_info, string dot_hlo_name,
                            const llvm_ir::IrArray& target_array,
                            const llvm_ir::IrArray& lhs_array,
                            const llvm_ir::IrArray& rhs_array,
@@ -974,7 +207,8 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot,
                            llvm::IRBuilder<>* b,
                            const HloModuleConfig& hlo_module_config,
                            const TargetMachineFeatures& target_machine_features)
-    : dot_(dot),
+    : dot_info_(std::move(dot_info)),
+      dot_hlo_name_(std::move(dot_hlo_name)),
       target_array_(target_array),
       lhs_array_(lhs_array),
       rhs_array_(rhs_array),
@@ -984,58 +218,9 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot,
       hlo_module_config_(hlo_module_config),
       target_machine_features_(target_machine_features) {}
 
-/* static */ Status DotOpEmitter::EmitDotOperation(
-    const HloInstruction& dot, const llvm_ir::IrArray& target_array,
-    const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array,
-    const llvm_ir::IrArray* addend_array,
-    llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b,
-    const HloModuleConfig& hlo_module_config,
-    const TargetMachineFeatures& target_machine_features) {
-  PrimitiveType type = target_array.GetShape().element_type();
-  TF_RET_CHECK(F16 == type || F32 == type || F64 == type || C64 == type);
-  DotOpEmitter dot_emitter(dot, target_array, lhs_array, rhs_array,
-                           addend_array, executable_run_options_value, b,
-                           hlo_module_config, target_machine_features);
-  return dot_emitter.Emit();
-}
-
-bool DotOpEmitter::EmitSmallGemmIfProfitable(
-    const DotOpEmitter::MatMultDims& mat_mult_dims) {
-  if (ShouldUseMultiThreadedEigen()) {
-    return false;
-  }
-
-  if (!EnableExperimentalLlvmIrGemm()) {
-    // TODO(sanjoy):  We should make these numbers micro-arch specific.
-    bool small_gemm = mat_mult_dims.k <= 128 &&
-                      ((mat_mult_dims.m <= 32 && mat_mult_dims.n <= 128) ||
-                       (mat_mult_dims.m <= 128 && mat_mult_dims.n <= 32));
-    if (!small_gemm) {
-      return false;
-    }
-  }
-
-  if (mat_mult_dims.lhs_non_canonical || mat_mult_dims.rhs_non_canonical) {
-    return false;
-  }
-
-  PrimitiveType primitive_type = dot_.shape().element_type();
-
-  switch (primitive_type) {
-    default:
-      return false;
-
-    case F32:
-    case F64:
-    case S32:
-    case S64:
-      break;
-  }
-
-  if (!(mat_mult_dims.lhs_column_major == mat_mult_dims.rhs_column_major &&
-        mat_mult_dims.rhs_column_major == mat_mult_dims.target_column_major)) {
-    return false;
-  }
+void DotOpEmitter::EmitTiledLlvmIrGemm() {
+  PrimitiveType primitive_type = dot_info_.result_shape.element_type();
+  MatMultDims mat_mult_dims = GetMatMultDims();
 
   llvm::Value* lhs = lhs_array_.GetBasePointer();
   llvm::Value* rhs = rhs_array_.GetBasePointer();
@@ -1050,9 +235,8 @@ bool DotOpEmitter::EmitSmallGemmIfProfitable(
   }
 
   int64 size_bytes = m * n * ShapeUtil::ByteSizeOfPrimitiveType(primitive_type);
-  b_->CreateMemSet(
-      target, b_->getInt8(0), size_bytes,
-      target_machine_features_.minimum_alignment_for_allocation(size_bytes));
+  b_->CreateMemSet(target, b_->getInt8(0), /*Size=*/size_bytes,
+                   /*Align=*/1);
 
   int64 max_target_vector_width =
       target_machine_features_.vector_register_num_elements(
@@ -1062,47 +246,28 @@ bool DotOpEmitter::EmitSmallGemmIfProfitable(
   std::tie(tile_size_m, tile_size_k, tile_size_n_in_vector_width) =
       GetGemmTileSize();
 
-  TiledSmallGemmEmitter::Config config(
-      /*scalar_type=*/primitive_type,
-      TiledSmallGemmEmitter::Dimensions{/*m=*/m, /*k=*/k, /*n=*/n},
-      /*max_vectorization_width=*/max_target_vector_width,
-      /*max_vector_count=*/tile_size_n_in_vector_width,
-      /*min_vectorization_width=*/std::min<int64>(4, max_target_vector_width),
-      /*tile_size_m=*/tile_size_m, /*tile_size_k=*/tile_size_k);
-
-  VLOG(2) << "Emitting GEMM kernel in LLVM IR with config "
-          << config.GetCacheKey();
-
   const bool enable_fast_math =
       hlo_module_config_.debug_options().xla_cpu_enable_fast_math();
   const bool optimize_for_size =
       options::OptimizeForSizeRequested(hlo_module_config_);
 
-  KernelSupportLibrary::EmitAndCallOutlinedKernel(
+  EmitSmallGemm(
+      /*scalar_type=*/primitive_type,
+      /*m=*/m, /*k=*/k, /*n=*/n,
+      /*max_vectorization_width=*/max_target_vector_width,
+      /*max_vector_count=*/tile_size_n_in_vector_width,
+      /*min_vectorization_width=*/std::min<int64>(4, max_target_vector_width),
+      /*tile_size_m=*/tile_size_m, /*tile_size_k=*/tile_size_k, /*lhs=*/lhs,
+      /*rhs=*/rhs, /*result=*/target, b_,
       /*enable_fast_math=*/enable_fast_math,
-      /*optimize_for_size=*/optimize_for_size, b_, config.GetCacheKey(), lhs,
-      rhs, target,
-      [this, config](llvm::Value* lhs, llvm::Value* rhs, llvm::Value* target) {
-        TiledSmallGemmEmitter small_gemm_emitter(config, /*lhs=*/lhs,
-                                                 /*rhs=*/rhs,
-                                                 /*result=*/target, b_);
-        small_gemm_emitter.Emit();
-      });
-
-  return true;
+      /*optimize_for_size=*/optimize_for_size);
 }
 
-bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
-  if (dot_.shape().dimensions_size() != 2) {
-    return false;
-  }
-
-  PrimitiveType primitive_type = dot_.shape().element_type();
+void DotOpEmitter::EmitTiledLlvmIrGemv() {
+  PrimitiveType primitive_type = dot_info_.result_shape.element_type();
 
-  if (!primitive_util::IsFloatingPointType(primitive_type) &&
-      !primitive_util::IsIntegralType(primitive_type)) {
-    return false;
-  }
+  CHECK(primitive_util::IsFloatingPointType(primitive_type) ||
+        primitive_util::IsIntegralType(primitive_type));
 
   MatMultDims mat_mult_dims = GetMatMultDims();
   bool is_column_major_matrix_vector = false;
@@ -1143,9 +308,7 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
     }
   }
 
-  if (!is_column_major_matrix_vector && !is_row_major_matrix_vector) {
-    return EmitSmallGemmIfProfitable(mat_mult_dims);
-  }
+  CHECK(is_column_major_matrix_vector || is_row_major_matrix_vector);
 
   int64 tiling_factor = GetGemvTilingFactor();
   CHECK_GT(tiling_factor, 0);
@@ -1177,44 +340,27 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
   if (is_column_major_matrix_vector) {
     VLOG(2) << "Emitting column major matrix-vector multiply with m = " << m
             << " and k = " << k;
-    ColumnMajorMatrixVectorProductEmitter::Config config(
+    EmitColumnMajorGemv(
         /*scalar_type=*/primitive_type,
         /*tile_rows=*/vector_register_element_size, /*tile_cols=*/tiling_factor,
-        /*m=*/m, /*k=*/k, /*has_addend=*/addend_array_ != nullptr);
-
-    KernelSupportLibrary::EmitAndCallOutlinedKernel(
+        /*m=*/m, /*k=*/k, /*lhs=*/lhs_op, /*rhs=*/rhs_op,
+        /*addend=*/addend_array_ ? addend_array_->GetBasePointer() : nullptr,
+        /*result=*/result_op, b_,
         /*enable_fast_math=*/enable_fast_math,
-        /*optimize_for_size=*/optimize_for_size, b_, config.GetCacheKey(),
-        lhs_op, rhs_op,
-        addend_array_ ? addend_array_->GetBasePointer() : nullptr, result_op,
-        [this, config](llvm::Value* lhs_op, llvm::Value* rhs_op,
-                       llvm::Value* addend_op, llvm::Value* result_op) {
-          ColumnMajorMatrixVectorProductEmitter emitter(
-              config, lhs_op, rhs_op, addend_op, result_op, b_);
-          emitter.Emit();
-        });
+        /*optimize_for_size=*/optimize_for_size);
   } else {
     VLOG(2) << "Emitting row major matrix-vector multiply with m = " << m
             << " and k = " << k;
-    RowMajorMatrixVectorProductEmitter::Config config(
+    EmitRowMajorGemv(
         /*scalar_type=*/primitive_type,
-        /*tile_rows=*/tiling_factor, /*tile_cols=*/vector_register_element_size,
-        /*m=*/m, /*k=*/k, /*has_addend=*/addend_array_ != nullptr);
-
-    KernelSupportLibrary::EmitAndCallOutlinedKernel(
+        /*tile_rows=*/tiling_factor,
+        /*tile_cols=*/vector_register_element_size,
+        /*m=*/m, /*k=*/k, /*lhs=*/lhs_op, /*rhs=*/rhs_op,
+        /*addend=*/addend_array_ ? addend_array_->GetBasePointer() : nullptr,
+        /*result=*/result_op, b_,
         /*enable_fast_math=*/enable_fast_math,
-        /*optimize_for_size=*/optimize_for_size, b_, config.GetCacheKey(),
-        lhs_op, rhs_op,
-        addend_array_ ? addend_array_->GetBasePointer() : nullptr, result_op,
-        [this, config](llvm::Value* lhs_op, llvm::Value* rhs_op,
-                       llvm::Value* addend_op, llvm::Value* result_op) {
-          RowMajorMatrixVectorProductEmitter emitter(config, lhs_op, rhs_op,
-                                                     addend_op, result_op, b_);
-          emitter.Emit();
-        });
+        /*optimize_for_size=*/optimize_for_size);
   }
-
-  return true;
 }
 
 Status DotOpEmitter::Emit() {
@@ -1240,11 +386,6 @@ Status DotOpEmitter::Emit() {
   // which performs the sum-of-products (the reduction loop) before storing
   // the result in the output buffer.
 
-  // This routine assumes that the dot operation is not in a parallelized
-  // enclosing computation.
-  CHECK(
-      dot_.parent()->root_instruction()->outer_dimension_partitions().empty());
-
   const Shape& lhs_shape = lhs_array_.GetShape();
   const Shape& rhs_shape = rhs_array_.GetShape();
 
@@ -1255,27 +396,41 @@ Status DotOpEmitter::Emit() {
     return EmitScalarDot();
   }
 
-  if (EmitLlvmIrDotIfProfitable()) {
-    return Status::OK();
+  switch (GetDotImplementationStrategy(hlo_module_config_, dot_info_,
+                                       target_machine_features_)) {
+    case DotImplementationStrategy::kNaiveLlvmIr:
+      EmitNaiveLlvmIrGemm();
+      return Status::OK();
+
+    case DotImplementationStrategy::kTiledLlvmIrGemv:
+      EmitTiledLlvmIrGemv();
+      return Status::OK();
+
+    case DotImplementationStrategy::kTiledLlvmIrGemm:
+      EmitTiledLlvmIrGemm();
+      return Status::OK();
+
+    case DotImplementationStrategy::kEigen:
+      return EmitCallToRuntime();
   }
+}
 
+void DotOpEmitter::EmitNaiveLlvmIrGemm() {
   CHECK_EQ(addend_array_, nullptr);
 
-  if (PotentiallyImplementedAsEigenDot(dot_, target_machine_features_)) {
-    return EmitCallToRuntime();
-  }
+  const Shape& lhs_shape = lhs_array_.GetShape();
+  const Shape& rhs_shape = rhs_array_.GetShape();
+  const DotDimensionNumbers& dim_nums = dot_info_.dim_nums;
 
   // Reduce along dimension 0 of the LHS and 1 of the RHS. Vectors are a special
   // case where the reduction dimension is 0 for both LHS and RHS. This results
   // in a vector dot product producing a scalar.
-  int64 lhs_reduction_dimension =
-      dot_.dot_dimension_numbers().lhs_contracting_dimensions(0);
-  int64 rhs_reduction_dimension =
-      dot_.dot_dimension_numbers().rhs_contracting_dimensions(0);
+  int64 lhs_reduction_dimension = dim_nums.lhs_contracting_dimensions(0);
+  int64 rhs_reduction_dimension = dim_nums.rhs_contracting_dimensions(0);
 
   // Verify the reduction dimension in the two operands are the same size.
-  TF_RET_CHECK(lhs_shape.dimensions(lhs_reduction_dimension) ==
-               rhs_shape.dimensions(rhs_reduction_dimension));
+  CHECK_EQ(lhs_shape.dimensions(lhs_reduction_dimension),
+           rhs_shape.dimensions(rhs_reduction_dimension));
 
   bool lhs_reduction_along_minor_dimension =
       lhs_reduction_dimension == LayoutUtil::Minor(lhs_shape.layout(), 0);
@@ -1285,7 +440,7 @@ Status DotOpEmitter::Emit() {
   // Create loop nests which loop through the LHS operand dimensions and the RHS
   // operand dimensions. The reduction dimension of the LHS and RHS are handled
   // in a separate innermost loop which performs the sum of products.
-  llvm_ir::ForLoopNest loop_nest(llvm_ir::IrName(&dot_), b_);
+  llvm_ir::ForLoopNest loop_nest(llvm_ir::IrName(dot_hlo_name_), b_);
   llvm_ir::IrArray::Index lhs_index = loop_nest.EmitOperandArrayLoopNest(
       lhs_array_, /*dimension_to_skip=*/lhs_reduction_dimension, "lhs");
   llvm_ir::IrArray::Index rhs_index = loop_nest.EmitOperandArrayLoopNest(
@@ -1390,8 +545,6 @@ Status DotOpEmitter::Emit() {
   // Set the IR builder insert point to the exit basic block of the outer most
   // loop.
   b_->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock());
-
-  return Status::OK();
 }
 
 Status DotOpEmitter::EmitScalarDot() {
@@ -1438,7 +591,7 @@ Status DotOpEmitter::EmitCallToRuntime() {
   // The two transpose_... parameters are actually booleans, but we use int32
   // to avoid target-dependent calling convention details.
 
-  bool multi_threaded = ShouldUseMultiThreadedEigen();
+  bool multi_threaded = ShouldUseMultiThreadedEigen(hlo_module_config_);
   bool use_mkl_dnn = hlo_module_config_.debug_options().xla_cpu_use_mkl_dnn();
   PrimitiveType type = target_array_.GetShape().element_type();
   llvm::Type* float_type;
@@ -1531,11 +684,11 @@ Status DotOpEmitter::EmitCallToRuntime() {
 }
 
 DotOpEmitter::MatMultDims DotOpEmitter::GetMatMultDims() const {
-  CHECK_EQ(dot_.shape().dimensions_size(), 2);
+  CHECK_EQ(dot_info_.result_shape.dimensions_size(), 2);
 
   const Shape& lhs_shape = lhs_array_.GetShape();
   const Shape& rhs_shape = rhs_array_.GetShape();
-  const DotDimensionNumbers& dim_nums = dot_.dot_dimension_numbers();
+  const DotDimensionNumbers& dim_nums = dot_info_.dim_nums;
 
   return {
       /*m=*/lhs_shape.dimensions(1 - dim_nums.lhs_contracting_dimensions(0)),
@@ -1549,74 +702,6 @@ DotOpEmitter::MatMultDims DotOpEmitter::GetMatMultDims() const {
       LayoutUtil::Minor(target_array_.GetShape().layout(), 0) == 0};
 }
 
-// Return whether the given shape is rank 2.
-static bool IsRank2(const Shape& shape) { return shape.rank() == 2; }
-
-// In a gemm operation where output = lhs * rhs, check whether the given shapes
-// are valid for the operation.
-static bool AreValidGemmShapes(
-    const Shape& lhs_shape, const Shape& rhs_shape, const Shape& output_shape,
-    const TargetMachineFeatures& target_machine_features) {
-  // The inputs and the output must
-  // 1) be matrices with no padding, and
-  // 2) have an allowed element type.
-  PrimitiveType output_primitive_type = output_shape.element_type();
-  if (!(output_primitive_type == F64 || output_primitive_type == F32 ||
-        output_primitive_type == F16)) {
-    return false;
-  }
-
-  if (!(IsRank2(lhs_shape) && IsRank2(rhs_shape) && IsRank2(output_shape))) {
-    return false;
-  }
-
-  auto is_aligned = [&](const Shape& shape) {
-    return GetMinimumAlignmentForArray(shape, target_machine_features) >=
-           TargetMachineFeatures::kEigenExpectedTensorAlignment;
-  };
-
-  if (!is_aligned(lhs_shape) || !is_aligned(rhs_shape) ||
-      !is_aligned(output_shape)) {
-    return false;
-  }
-
-  return true;
-}
-
-bool PotentiallyImplementedAsEigenDot(
-    const HloInstruction& hlo,
-    const TargetMachineFeatures& target_machine_features) {
-  // For certain types of Dot, we can call Eigen
-  if (hlo.opcode() == HloOpcode::kDot) {
-    const Shape& lhs_shape = hlo.operand(0)->shape();
-    const Shape& rhs_shape = hlo.operand(1)->shape();
-
-    if (ShapeUtil::IsZeroElementArray(lhs_shape) ||
-        ShapeUtil::IsZeroElementArray(rhs_shape)) {
-      return false;
-    }
-
-    if (ProfitableToImplementDotInTiledLlvmIr(hlo)) {
-      return false;
-    }
-
-    // If gemm can accept the operand shapes, use it rather than a custom
-    // kernel.
-    if (AreValidGemmShapes(lhs_shape, rhs_shape, hlo.shape(),
-                           target_machine_features)) {
-      const DotDimensionNumbers& dim_numbers = hlo.dot_dimension_numbers();
-      // The size of the reduction dimension should match. The shape inference
-      // guarantees this invariant, so the check here is for programming
-      // errors.
-      CHECK_EQ(lhs_shape.dimensions(dim_numbers.lhs_contracting_dimensions(0)),
-               rhs_shape.dimensions(dim_numbers.rhs_contracting_dimensions(0)));
-      return true;
-    }
-  }
-
-  return false;
-}
-
 // For vector-matrix dot products, it is always profitable to make the Rhs
 // column major.
 absl::optional<int64> ProfitableToMakeDotOperandColumnMajor(
@@ -1655,16 +740,157 @@ absl::optional<int64> ProfitableToMakeDotOperandColumnMajor(
   return {};
 }
 
-bool ProfitableToImplementDotInTiledLlvmIr(const HloInstruction& dot) {
+namespace {
+// Return whether the given shape is rank 2.
+bool IsRank2(const Shape& shape) { return shape.rank() == 2; }
+
+bool IsSimpleLayout(const Layout& layout) {
+  return layout.tiles().empty() && layout.format() == DENSE;
+}
+
+// In a gemm operation where output = lhs * rhs, check whether the given shapes
+// are valid for the operation.
+bool AreGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape,
+                   const Shape& output_shape,
+                   const TargetMachineFeatures& target_machine_features) {
+  CHECK(!lhs_shape.has_layout() || IsSimpleLayout(lhs_shape.layout()))
+      << lhs_shape.DebugString();
+  CHECK(!rhs_shape.has_layout() || IsSimpleLayout(rhs_shape.layout()))
+      << rhs_shape.DebugString();
+  CHECK(!output_shape.has_layout() || IsSimpleLayout(output_shape.layout()))
+      << output_shape.DebugString();
+
+  switch (output_shape.element_type()) {
+    case F64:
+    case F32:
+    case F16:
+      return IsRank2(lhs_shape) && IsRank2(rhs_shape) && IsRank2(output_shape);
+    default:
+      return false;
+  }
+}
+
+bool IsAlignedGemm(const DotInfo& dot_info,
+                   const TargetMachineFeatures& target_machine_features) {
+  if (ShapeUtil::IsZeroElementArray(dot_info.lhs_shape) ||
+      ShapeUtil::IsZeroElementArray(dot_info.rhs_shape)) {
+    return false;
+  }
+
+  return AreGemmShapes(dot_info.lhs_shape, dot_info.rhs_shape,
+                       dot_info.result_shape, target_machine_features);
+}
+
+bool CanEmitTiledLlvmIrGemm(
+    const HloModuleConfig& config, const DotInfo& dot_info,
+    const TargetMachineFeatures& target_machine_features) {
+  CHECK(IsAlignedGemm(dot_info, target_machine_features));
+
+  if (ShouldUseMultiThreadedEigen(config)) {
+    return false;
+  }
+
+  int m = dot_info.result_shape.dimensions(0);
+  int k = dot_info.lhs_shape.dimensions(
+      dot_info.dim_nums.lhs_contracting_dimensions(0));
+  int n = dot_info.result_shape.dimensions(1);
+
+  if (!options::ForceEnableExperimentalLlvmIrGemm(config)) {
+    // TODO(sanjoy):  We should make these numbers micro-arch specific.
+    bool small_gemm =
+        k <= 128 && ((m <= 32 && n <= 128) || (m <= 128 && n <= 32));
+    if (!small_gemm) {
+      return false;
+    }
+  }
+
+  bool lhs_non_canonical = dot_info.dim_nums.lhs_contracting_dimensions(0) == 0;
+  bool rhs_non_canonical = dot_info.dim_nums.rhs_contracting_dimensions(0) == 1;
+
+  if (lhs_non_canonical || rhs_non_canonical) {
+    return false;
+  }
+
+  if (dot_info.result_shape.element_type() == F16) {
+    // TODO(sanjoy): This is probably easy to fix, but I want to keep the CL
+    // adding this comment NFC.
+    return false;
+  }
+
+  return true;
+}
+
+DotImplementationStrategy GetDotImplementationStrategy(
+    const HloModuleConfig& config, const DotInfo& dot_info,
+    const TargetMachineFeatures& target_machine_features) {
+  PrimitiveType element_type = dot_info.result_shape.element_type();
   // Any Matrix-Vector product of floating point or integral type, or
   // a transpose-dot fusion of the same can be lowered to a tiled LLVM
   // IR implementation.
-  const Shape& shape = dot.shape();
-  return shape.dimensions_size() == 2 &&
-         (shape.dimensions(0) == 1 || shape.dimensions(1) == 1) &&
-         (primitive_util::IsFloatingPointType(shape.element_type()) ||
-          primitive_util::IsIntegralType(shape.element_type()));
+  if (dot_info.result_shape.dimensions_size() == 2 &&
+      (dot_info.result_shape.dimensions(0) == 1 ||
+       dot_info.result_shape.dimensions(1) == 1) &&
+      (primitive_util::IsFloatingPointType(element_type) ||
+       primitive_util::IsIntegralType(element_type))) {
+    return DotImplementationStrategy::kTiledLlvmIrGemv;
+  }
+
+  if (IsAlignedGemm(dot_info, target_machine_features)) {
+    return CanEmitTiledLlvmIrGemm(config, dot_info, target_machine_features)
+               ? DotImplementationStrategy::kTiledLlvmIrGemm
+               : DotImplementationStrategy::kEigen;
+  }
+
+  return DotImplementationStrategy::kNaiveLlvmIr;
 }
+}  // namespace
 
+bool DotImplementationCanHandleTranspose(
+    const HloInstruction& dot_instr,
+    const TargetMachineFeatures& target_machine_features) {
+  DotImplementationStrategy impl_strategy =
+      GetDotImplementationStrategy(dot_instr.parent()->parent()->config(),
+                                   DotInfo(dot_instr), target_machine_features);
+
+  // TODO(sanjoy): This is not quite right, it should be `impl_strategy ==
+  // kEigen || impl_strategy == kTiledLlvmIrGemv || impl_strategy ==
+  // kNaiveLlvmIr` but I'll fix this in a later CL in the interest of keeping
+  // the CL adding this comment NFC.
+  return impl_strategy == DotImplementationStrategy::kTiledLlvmIrGemm ||
+         impl_strategy == DotImplementationStrategy::kEigen;
+}
+
+bool DotOperandsAndResultMustHaveRowMajorLayout(
+    const HloInstruction& dot_instr,
+    const TargetMachineFeatures& target_machine_features) {
+  DotImplementationStrategy impl_strategy =
+      GetDotImplementationStrategy(dot_instr.parent()->parent()->config(),
+                                   DotInfo(dot_instr), target_machine_features);
+
+  return impl_strategy == DotImplementationStrategy::kTiledLlvmIrGemm ||
+         impl_strategy == DotImplementationStrategy::kEigen;
+}
+
+Status EmitDotOperation(const HloInstruction& dot,
+                        const llvm_ir::IrArray& target_array,
+                        const llvm_ir::IrArray& lhs_array,
+                        const llvm_ir::IrArray& rhs_array,
+                        const llvm_ir::IrArray* addend_array,
+                        llvm::Value* executable_run_options_value,
+                        llvm::IRBuilder<>* b,
+                        const HloModuleConfig& hlo_module_config,
+                        const TargetMachineFeatures& target_machine_features) {
+  // This routine assumes that the dot operation is not in a parallelized
+  // enclosing computation.
+  CHECK(dot.parent()->root_instruction()->outer_dimension_partitions().empty());
+
+  PrimitiveType type = target_array.GetShape().element_type();
+  TF_RET_CHECK(F16 == type || F32 == type || F64 == type || C64 == type);
+  DotOpEmitter dot_emitter(DotInfo(dot), dot.name(), target_array, lhs_array,
+                           rhs_array, addend_array,
+                           executable_run_options_value, b, hlo_module_config,
+                           target_machine_features);
+  return dot_emitter.Emit();
+}
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
index 4c2041b556aa8bf8fe8fb8e0674c0f4f04f0acae..105bd3005c86d87443b2528eba7b0106ad70590e 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
@@ -30,9 +30,16 @@ limitations under the License.
 
 namespace xla {
 namespace cpu {
+// Returns true if the two operands and the output of `dot_instr` must have row
+// major layout.
+bool DotOperandsAndResultMustHaveRowMajorLayout(
+    const HloInstruction& dot_instr,
+    const TargetMachineFeatures& target_machine_features);
 
-bool PotentiallyImplementedAsEigenDot(
-    const HloInstruction& hlo,
+// Returns true our lowering strategy for `dot_instr` can fold in transposes to
+// the either of the inputs.
+bool DotImplementationCanHandleTranspose(
+    const HloInstruction& dot_instr,
     const TargetMachineFeatures& target_machine_features);
 
 // Returns the index for an operand to `hlo` that should ideally be column
@@ -41,129 +48,24 @@ bool PotentiallyImplementedAsEigenDot(
 absl::optional<int64> ProfitableToMakeDotOperandColumnMajor(
     const HloInstruction& hlo);
 
-// Returns true to indicate that we can generate a tiled LLVM IR implementation
-// for |dot|.
-bool ProfitableToImplementDotInTiledLlvmIr(const HloInstruction& dot);
-
-// Helper class for emitting LLVM IR to perform the dot operation.
-class DotOpEmitter {
- public:
-  // Emit LLVM IR to perform the dot operation on lhs_array and rhs_array and
-  // place the result in target_array. IR is emitted at current insert point of
-  // the builder. Upon completion of the method, the insert point is set to the
-  // end of all instructions emitted for this operation.
-  //
-  // If `addend_array` is not nullptr then it must be an array of the same
-  // dimensions as the result, and the result is computed as `addend_array` +
-  // dot(`lhs_array`, `rhs_array`).  A non-null `addend_array` is only supported
-  // for Matrix-vector products.
-  static Status EmitDotOperation(
-      const HloInstruction& dot, const llvm_ir::IrArray& target_array,
-      const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array,
-      const llvm_ir::IrArray* addend_array,
-      llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b,
-      const HloModuleConfig& hlo_module_config,
-      const TargetMachineFeatures& target_machine_features);
-
- private:
-  DotOpEmitter(const HloInstruction& dot, const llvm_ir::IrArray& target_array,
-               const llvm_ir::IrArray& lhs_array,
-               const llvm_ir::IrArray& rhs_array,
-               const llvm_ir::IrArray* addend_array,
-               llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b,
-               const HloModuleConfig& hlo_module_config,
-               const TargetMachineFeatures& target_machine_features);
-
-  // Emits the IR to perform the dot operation.
-  Status Emit();
-
-  // Emits instructions to perform a scalar dot product (a multiply of the
-  // LHS and RHS) and store the results in the target.
-  Status EmitScalarDot();
-
-  // Emit an LLVM IR implementation of the dot operation if we can.  Returns
-  // true if an LLVM IR implementation was emitted.
-  bool EmitLlvmIrDotIfProfitable();
-
-  // Emits a call to the CPU runtime to perform the matrix multiply.
-  Status EmitCallToRuntime();
-
-  // Represents the dimensions of a matrix-matrix multiply operation.
-  struct MatMultDims {
-    // The number of rows in the LHS.
-    int64 m;
-
-    // The number of columns in the LHS, which is also must be equal to the
-    // number of rows in the RHS.
-    int64 k;
-
-    // The number of columns on the RHS.
-    int64 n;
-
-    // True if the LHS matrix is column major.
-    bool lhs_column_major;
-
-    // True if the LHS contraction dimension is not 1.
-    bool lhs_non_canonical;
-
-    // True if the RHS matrix is column major.
-    bool rhs_column_major;
-
-    // True if the RHS contraction dimension is not 0.
-    bool rhs_non_canonical;
-
-    // True if the result matrix is column major.
-    bool target_column_major;
-  };
-
-  // Get the MatMultDims instance for the dot product this DotOpEmitter
-  // represents.  Precondition: the dot is of rank 2 (and thus its operands are
-  // of rank 2 as well).
-  MatMultDims GetMatMultDims() const;
-
-  bool EmitSmallGemmIfProfitable(const MatMultDims& mat_mult_dims);
-
-  // When doing a tiled GEMV in LLVM IR, a "tile" consists of this many vector
-  // registers.
-  int64 GetGemvTilingFactor() const {
-    const int64 kDefaultTilingFactor = 8;
-    return options::LlvmIrGemvTilingFactor(hlo_module_config_)
-        .value_or(kDefaultTilingFactor);
-  }
-
-  std::tuple<int64, int64, int64> GetGemmTileSize() const {
-    // Tuned for broadwell - Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
-    //
-    // TODO(b/80093688): Tune for other architectures and centralize this
-    // information in one place.
-    const std::tuple<int64, int64, int64> kDefaultTileSize =
-        std::tuple<int64, int64, int64>(11, 9, 1);
-    return options::LlvmIrGemmTileSize(hlo_module_config_)
-        .value_or(kDefaultTileSize);
-  }
-
-  // Returns true if we should use an experimental implementation of GEMM
-  // (general matrix matrix multiplication) if possible.
-  bool EnableExperimentalLlvmIrGemm() const {
-    return options::EnableExperimentalLlvmIrGemm(hlo_module_config_);
-  }
-
-  // Returns true if we should call into multi-threaded Eigen routines.
-  bool ShouldUseMultiThreadedEigen() {
-    return hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen();
-  }
-
-  const HloInstruction& dot_;
-  const llvm_ir::IrArray& target_array_;
-  const llvm_ir::IrArray& lhs_array_;
-  const llvm_ir::IrArray& rhs_array_;
-  const llvm_ir::IrArray* addend_array_;
-  llvm::Value* executable_run_options_value_;
-  llvm::IRBuilder<>* b_;
-  const HloModuleConfig& hlo_module_config_;
-  const TargetMachineFeatures& target_machine_features_;
-};
-
+// Emit LLVM IR to perform the dot operation on lhs_array and rhs_array and
+// place the result in target_array. IR is emitted at current insert point of
+// the builder. Upon completion of the method, the insert point is set to the
+// end of all instructions emitted for this operation.
+//
+// If `addend_array` is not nullptr then it must be an array of the same
+// dimensions as the result, and the result is computed as `addend_array` +
+// dot(`lhs_array`, `rhs_array`).  A non-null `addend_array` is only supported
+// for Matrix-vector products.
+Status EmitDotOperation(const HloInstruction& dot,
+                        const llvm_ir::IrArray& target_array,
+                        const llvm_ir::IrArray& lhs_array,
+                        const llvm_ir::IrArray& rhs_array,
+                        const llvm_ir::IrArray* addend_array,
+                        llvm::Value* executable_run_options_value,
+                        llvm::IRBuilder<>* b,
+                        const HloModuleConfig& hlo_module_config,
+                        const TargetMachineFeatures& target_machine_features);
 }  // namespace cpu
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter_internal.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter_internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc28918ed60a8086135846e2b9b1b9d75ec31ef6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter_internal.h
@@ -0,0 +1,88 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_DOT_OP_EMITTER_INTERNAL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_DOT_OP_EMITTER_INTERNAL_H_
+
+#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+
+// -----------------------------------------------------------------------------
+// INTERNAL HEADER.
+//
+// This file exposes internal implementation details from dot_op_emitter.cc for
+// unit tests.  Please do not depend on this!
+//
+// -----------------------------------------------------------------------------
+
+namespace xla {
+namespace cpu {
+namespace internal {
+
+// Represents a dot operation.  We use this in lieu of an `HloInstruction`
+// because we want to be able to create this for the "inner" dot operation in a
+// batch dot, for which there is no separate HLO instruction.
+struct DotInfo {
+  Shape lhs_shape;
+  Shape rhs_shape;
+  Shape result_shape;
+  DotDimensionNumbers dim_nums;
+
+  explicit DotInfo(const HloInstruction& instr) {
+    CHECK_EQ(instr.opcode(), HloOpcode::kDot);
+    lhs_shape = instr.operand(0)->shape();
+    rhs_shape = instr.operand(1)->shape();
+    result_shape = instr.shape();
+    dim_nums = instr.dot_dimension_numbers();
+  }
+};
+
+// Dictates how a dot operation is implemented.
+enum class DotImplementationStrategy {
+  // The dot operation is lowered into LLVM IR that implements a naive nested
+  // loop that computes the result one element at a time.  This is our
+  // "fallback"; we don't really want this to kick in for any non-trival dot
+  // operation.
+  kNaiveLlvmIr,
+
+  // The dot operation is lowered into LLVM IR that implements a tiled
+  // Matrix*Vector operation.  This strategy also allows fusing in a bias add
+  // into the dot.  The matrix can be row major or column major, both are
+  // supported.
+  kTiledLlvmIrGemv,
+
+  // The dot operation is lowered into LLVM IR that implemetns a tiled
+  // Matrix*Matrix operation.  No fusions are supported.  The two inputs
+  // and the output have to be row major.
+  kTiledLlvmIrGemm,
+
+  // The dot operation is lowered into a call into an Eigen routine.  No fusions
+  // are supported today.  The two inputs and the output have to be row major.
+  // However, we do allow transposing either the LHS or the RHS as part of the
+  // GEMM -- we expose this flexibility as flexibility in the contraction
+  // dimensions, but we can also see this as flexibility in the input layouts.
+  kEigen,
+};
+
+// Returns the implementation strategy for a dot with the configuration
+// `dot_info`.
+DotImplementationStrategy GetDotImplementationStrategy(
+    const HloModuleConfig& config, const DotInfo& dot_info,
+    const TargetMachineFeatures& target_machine_features);
+}  // namespace internal
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_DOT_OP_EMITTER_INTERNAL_H_
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 169d628923b05c43590782bcbebb0e5e539d83e3..91e369335455669c1ea16cedc930eb2c34b76abe 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -24,11 +24,9 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+// IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
-#include "tensorflow/core/lib/math/math_util.h"
-#include "tensorflow/core/platform/logging.h"
-// IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/types/span.h"
@@ -70,6 +68,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/math/math_util.h"
+#include "tensorflow/core/platform/logging.h"
 
 namespace xla {
 
@@ -970,10 +970,10 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
           << llvm_ir::DumpToString(*target_array.GetBasePointer());
 
   // Dot operation is complicated so we delegate to a helper class.
-  return DotOpEmitter::EmitDotOperation(
-      *dot, target_array, lhs_array, rhs_array, /*addend_array=*/nullptr,
-      GetExecutableRunOptionsArgument(), &b_, hlo_module_config_,
-      target_machine_features_);
+  return EmitDotOperation(*dot, target_array, lhs_array, rhs_array,
+                          /*addend_array=*/nullptr,
+                          GetExecutableRunOptionsArgument(), &b_,
+                          hlo_module_config_, target_machine_features_);
 }
 
 StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForConvolution(
@@ -1399,7 +1399,7 @@ static bool ReductionPreservesLayout(const HloInstruction& reduce) {
 
   int64 delta = 0;
   for (int64 i = 0; i < operand_shape.dimensions_size(); i++) {
-    if (reduced_dims.count(i)) {
+    if (reduced_dims.contains(i)) {
       delta++;
     } else {
       InsertOrDie(&unreduced_dim_map, i, i - delta);
@@ -1412,7 +1412,7 @@ static bool ReductionPreservesLayout(const HloInstruction& reduce) {
   for (int64 operand_dim_idx = 0;
        operand_dim_idx < operand_shape.dimensions_size(); operand_dim_idx++) {
     int64 operand_dim = operand_shape.layout().minor_to_major(operand_dim_idx);
-    if (!reduced_dims.count(operand_dim)) {
+    if (!reduced_dims.contains(operand_dim)) {
       if (FindOrDie(unreduced_dim_map, operand_dim) !=
           result_shape.layout().minor_to_major(result_dim_idx++)) {
         return false;
@@ -1709,10 +1709,8 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
       vectorization_factor_in_bytes /
       ShapeUtil::ByteSizeOfPrimitiveType(reduce->shape().element_type());
 
-  bool is_reduction_over_minor_dimension =
-      std::find(dimensions.begin(), dimensions.end(),
-                LayoutUtil::Minor(arg->shape().layout(), 0)) !=
-      dimensions.end();
+  bool is_reduction_over_minor_dimension = absl::c_linear_search(
+      dimensions, LayoutUtil::Minor(arg->shape().layout(), 0));
 
   unsigned element_alignment = tensorflow::MathUtil::GCD<unsigned>(
       ShapeUtil::ByteSizeOfPrimitiveType(reduce->shape().element_type()),
@@ -1990,7 +1988,7 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) {
   // The memcpy will copy elements that are logically this shape (allowed to be
   // scalar).
   const Shape logical_element_shape = ShapeUtil::FilterDimensions(
-      [&inner_dims](int64 dim) -> bool { return inner_dims.count(dim); },
+      [&inner_dims](int64 dim) { return inner_dims.contains(dim); },
       operand->shape());
 
   const int64 primitive_elements_per_logical_element =
@@ -2205,10 +2203,10 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
     llvm_ir::IrArray addend_array(
         GetIrArrayFor(fusion->operand(addend_param_number)));
 
-    TF_RETURN_IF_ERROR(DotOpEmitter::EmitDotOperation(
-        *dot, target_array, lhs_array, rhs_array, &addend_array,
-        GetExecutableRunOptionsArgument(), &b_, hlo_module_config_,
-        target_machine_features_));
+    TF_RETURN_IF_ERROR(
+        EmitDotOperation(*dot, target_array, lhs_array, rhs_array,
+                         &addend_array, GetExecutableRunOptionsArgument(), &b_,
+                         hlo_module_config_, target_machine_features_));
     return Status::OK();
   } else {
     return Unimplemented("Fusion kind not implemented on CPU");
@@ -2401,8 +2399,7 @@ StatusOr<bool> IrEmitter::EmitFastConcatenate(
   int64 concat_dim = concatenate->dimensions(0);
   const Layout& output_layout = output_shape.layout();
   auto output_min2maj = LayoutUtil::MinorToMajor(output_layout);
-  auto concat_dim_layout_itr =
-      std::find(output_min2maj.begin(), output_min2maj.end(), concat_dim);
+  auto concat_dim_layout_itr = absl::c_find(output_min2maj, concat_dim);
 
   std::vector<int64> inner_dims(output_min2maj.begin(), concat_dim_layout_itr);
   std::vector<int64> outer_dims(std::next(concat_dim_layout_itr),
@@ -2956,8 +2953,7 @@ Status IrEmitter::ElementTypesSameAndSupported(
 
   TF_RET_CHECK(!operands.empty());
   PrimitiveType primitive_type = operands[0]->shape().element_type();
-  if (std::find(supported_types.begin(), supported_types.end(),
-                primitive_type) == supported_types.end()) {
+  if (!absl::c_linear_search(supported_types, primitive_type)) {
     return Unimplemented("unsupported operand type %s in op %s",
                          PrimitiveType_Name(primitive_type),
                          HloOpcodeString(instruction.opcode()));
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index db76de4bb2b8ed568bf2557a30fa216d0cbe518d..a6fb11dcbf9bb201ba8837866e2f509c48bfd061 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -448,7 +448,7 @@ class IrEmitter : public DfsHloVisitorWithDefault,
       computation_to_profile_idx_;
 
   // Maps HLOs to Values emitted for them.
-  std::unordered_map<const HloInstruction*, llvm::Value*> emitted_value_;
+  absl::flat_hash_map<const HloInstruction*, llvm::Value*> emitted_value_;
 
   llvm_ir::AliasAnalysis alias_analysis_;
 
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
index 3b423f639137149f7efdee91a02bbcf01d1cc6e1..6121d1ca9a5c785cedd947200d3e7e320aa06bc2 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
@@ -146,8 +146,6 @@ int64 ParallelTaskAssignment::GetTargetParallelTaskCount(
       (opcode == HloOpcode::kConvolution &&
        PotentiallyImplementedAsEigenConvolution(*instruction,
                                                 target_machine_features_)) ||
-      PotentiallyImplementedAsEigenDot(*instruction,
-                                       target_machine_features_) ||
       (opcode == HloOpcode::kFusion &&
        instruction->fusion_kind() != HloInstruction::FusionKind::kLoop) ||
       instruction->shape().IsTuple()) {
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc
index 1ed743afc30af7c7ff38c7d2a738f2e376270952..1f7204e67a413efabd34cd7d88ced4c82ee7a5df 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc
@@ -20,6 +20,10 @@ limitations under the License.
 #include "tensorflow/core/platform/dynamic_annotations.h"
 #include "tensorflow/core/platform/types.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 using tensorflow::int32;
 using tensorflow::int64;
 
diff --git a/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.cc b/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eb6c44b70ab34d0a294880b5de4fe0b3ba5e19e5
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.cc
@@ -0,0 +1,1014 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h"
+
+#include "tensorflow/compiler/xla/service/cpu/vector_support_library.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+
+namespace xla {
+namespace cpu {
+namespace {
+
+using tensorflow::int64;
+
+// Provides tiled access to an in-memory rank 2 array.
+class MemoryTile {
+ public:
+  // Constructs a MemoryTile that can operate on tiles consisting of
+  // `tile_size_along_major_dim` vectors from the matrix `matrix`, starting at
+  // `major_dim_offset` in the major dimension.  The tile size along the minor
+  // dimension is the vector size, and that is implicitly determined by `vsl`.
+  MemoryTile(VectorSupportLibrary* vsl, llvm::IRBuilder<>* b,
+             llvm::Value* matrix, int64 matrix_size_along_minor_dim,
+             llvm::Value* major_dim_offset, int64 tile_size_along_major_dim)
+      : vsl_(vsl), b_(b) {
+    pointers_.reserve(tile_size_along_major_dim);
+    for (int64 i = 0; i < tile_size_along_major_dim; i++) {
+      llvm::Value* total_offset =
+          b->CreateMul(b->getInt64(matrix_size_along_minor_dim),
+                       b->CreateAdd(b->getInt64(i), major_dim_offset));
+      pointers_.push_back(vsl_->ComputeOffsetPointer(matrix, total_offset));
+    }
+  }
+
+  // Load a tile consisting of `tile_size_along_major_dim` vectors from position
+  // {major: `major_dim_offset`, minor: `minor_dim_offset`}.
+  //
+  // Note: `major_dim_offset` is a parameter to the constructor.
+  std::vector<llvm::Value*> LoadTile(llvm::Value* minor_dim_offset) const {
+    std::vector<llvm::Value*> result;
+    result.reserve(pointers_.size());
+    for (const auto& pointer : pointers_) {
+      result.push_back(vsl_->LoadVector(pointer, minor_dim_offset));
+    }
+    return result;
+  }
+
+  // Stores `tile` to position {major: `major_dim_offset`, minor:
+  // `minor_dim_offset`}.
+  //
+  // Note: `major_dim_offset` is a parameter to the constructor.
+  void StoreTile(absl::Span<llvm::Value* const> tile,
+                 llvm::Value* minor_dim_offset) const {
+    CHECK_EQ(tile.size(), pointers_.size());
+    for (int64 i = 0; i < pointers_.size(); i++) {
+      vsl_->StoreVector(tile[i], pointers_[i], minor_dim_offset);
+    }
+  }
+
+  // Loads a tile of size [`tile_size_along_major_dim`,
+  // `tile_size_along_middle_dim`] from position {major: `major_dim_offset`,
+  // minor: `minor_dim_offset`} and then broadcasts each element into a vector
+  // of size vsl_.vector_size().  The (i,j)'th element of the return value is
+  // the (i,j)'th element in the tile broadcasted into an LLVM vector.
+  //
+  // Note: `major_dim_offset` is a parameter to the constructor.
+  std::vector<std::vector<llvm::Value*>> LoadBroadcastTile(
+      llvm::Value* minor_dim_offset, int64 tile_size_along_middle_dim) const {
+    std::vector<std::vector<llvm::Value*>> result;
+    result.resize(pointers_.size());
+    for (int64 i = 0; i < pointers_.size(); i++) {
+      for (int64 j = 0; j < tile_size_along_middle_dim; j++) {
+        result[i].push_back(vsl_->LoadBroadcast(
+            pointers_[i], b_->CreateAdd(minor_dim_offset, b_->getInt64(j))));
+      }
+    }
+    return result;
+  }
+
+ private:
+  VectorSupportLibrary* vsl_;
+  llvm::IRBuilder<>* b_;
+  std::vector<llvm::Value*> pointers_;
+};
+
+// The base class for the classes representing the GEMV emitter configurations.
+//
+// The IR emitted (modulo the LLVM values representing the input and output
+// buffers) by the row major and column major GEMV emitters should be a function
+// of their configuration.  This is important because their configuration is
+// used as a key to cache the generated IR.
+class GemvConfig {
+ public:
+  // Mixin for convenience.
+  template <typename T>
+  struct User {
+   public:
+    PrimitiveType scalar_type() const {
+      return derived().config().scalar_type();
+    }
+    int64 tile_rows() const { return derived().config().tile_rows(); }
+    int64 tile_cols() const { return derived().config().tile_cols(); }
+    int64 m() const { return derived().config().m(); }
+    int64 k() const { return derived().config().k(); }
+    int64 has_addend() const { return derived().config().has_addend(); }
+
+   private:
+    const T& derived() const { return *static_cast<const T*>(this); }
+  };
+
+  PrimitiveType scalar_type() const { return scalar_type_; }
+  int64 tile_rows() const { return tile_rows_; }
+  int64 tile_cols() const { return tile_cols_; }
+  int64 m() const { return m_; }
+  int64 k() const { return k_; }
+  bool has_addend() const { return has_addend_; }
+
+  string GetCacheKey() const {
+    return absl::StrCat(name_, "_", PrimitiveType_Name(scalar_type()), "_",
+                        tile_rows(), "_", tile_cols(), "_", m(), "_", k(),
+                        has_addend() ? "_with_addend" : "");
+  }
+
+ protected:
+  explicit GemvConfig(string name, PrimitiveType scalar_type, int64 tile_rows,
+                      int64 tile_cols, int64 m, int64 k, bool has_addend)
+      : name_(std::move(name)),
+        scalar_type_(scalar_type),
+        tile_rows_(tile_rows),
+        tile_cols_(tile_cols),
+        m_(m),
+        k_(k),
+        has_addend_(has_addend) {}
+
+ private:
+  string name_;
+  PrimitiveType scalar_type_;
+  int64 tile_rows_;
+  int64 tile_cols_;
+  int64 m_;
+  int64 k_;
+  bool has_addend_;
+};
+
+// Computes a dot product between "[M,K]{0,1} lhs" with a [K,1] vector (the
+// layout of the vector does not matter).  This implementation uses a tiling
+// scheme to improve performance.
+//
+// We logically separate the LHS matrix into four segments:
+//
+//   +----------------------+---+
+//   |                      |   |
+//   |                      |   |
+//   |         A            | B |
+//   |                      |   |
+//   |                      |   |
+//   |                      |   |
+//   +----------------------+---+
+//   |         C            | D |
+//   +----------------------+---+
+//
+// where A is the largest submatrix of the LHS that can be evenly dividied into
+// tiles.  For each tile in A, assuming tile_rows_ == tile_cols_ == 4, we have:
+//
+//   +---+---+---+---+       +--+--+--+--+
+//   |M00|M10|M20|M30|       |V0|V1|V2|V3|
+//   +---+---+---+---+       +--+--+--+--+
+//   |M01|M11|M21|M31| and   |V0|V1|V2|V3|
+//   +---+---+---+---+       +--+--+--+--+
+//   |M02|M12|M22|M32|       |V0|V1|V2|V3|
+//   +---+---+---+---+       +--+--+--+--+
+//   |M03|M13|M23|M33|       |V0|V1|V2|V3|
+//   +---+---+---+---+       +--+--+--+--+
+//
+// (Legend: rows are horizontal and columns are vertical; and each column is one
+// llvm::Value of a vector type)
+//
+// where:
+//
+//   a. The left tile is from the column major left matrix.
+//   b. The right tile is an elementwise broadcast of a [V0, V1, V2, V3]
+//      vector loaded from the RHS vector.
+//
+// As we iterate through the column dimension, we compute the change to the
+// result vector by an elementwise multiplication between the two tiles above
+// followed by a reduction along the major dimension:
+//
+//                     +-----------------------------------+
+//                     | M00*V0 + M10*V1 + M20*V2 + M30*V3 |
+//                     +-----------------------------------+
+//                     | M01*V0 + M11*V1 + M21*V2 + M31*V3 |
+// Result[R:R+4] +=    +-----------------------------------+
+//                     | M02*V0 + M12*V1 + M22*V2 + M32*V3 |
+//                     +-----------------------------------+
+//                     | M03*V0 + M13*V1 + M23*V2 + M33*V3 |
+//                     +-----------------------------------+
+//
+// Where R is the starting row for the tile.
+//
+// We have an inner epilogue loop to deal with the "C" submatrix and an outer
+// epilogue loop to deal with the B,D submarix.
+//
+// TODO(sanjoy): We should investigate if using gather loads and scatter stores
+// can be used here have the same inner loop for both column-major and row-major
+// matrix-vector products.
+class ColumnMajorMatrixVectorProductEmitter
+    : public GemvConfig::User<ColumnMajorMatrixVectorProductEmitter> {
+ public:
+  class Config : public GemvConfig {
+   public:
+    explicit Config(PrimitiveType scalar_type, int64 tile_rows, int64 tile_cols,
+                    int64 m, int64 k, bool has_addend)
+        : GemvConfig(/*name=*/"col_major_gemv", scalar_type,
+                     /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols, /*m=*/m,
+                     /*k=*/k, /*has_addend=*/has_addend) {}
+  };
+
+  ColumnMajorMatrixVectorProductEmitter(const Config& config, llvm::Value* lhs,
+                                        llvm::Value* rhs, llvm::Value* addend,
+                                        llvm::Value* result,
+                                        llvm::IRBuilder<>* b)
+      : config_(config),
+        lhs_(lhs),
+        rhs_(rhs),
+        addend_(addend),
+        result_(result),
+        b_(b),
+        ksl_(b_),
+        vsl_(config.scalar_type(), /*vector_size=*/config.tile_rows(), b_, "") {
+    CHECK(tile_rows() > 0 && IsPowerOfTwo(static_cast<uint64>(tile_rows())));
+    CHECK(!has_addend() || addend != nullptr);
+  }
+
+  void Emit();
+
+  const Config& config() const { return config_; }
+
+ private:
+  void EmitOuterLoopBody(llvm::Value* column, int64 column_count,
+                         bool is_first_column);
+
+  MemoryTile GetLhsMemoryTile(llvm::Value* column_start, int64 column_count) {
+    return MemoryTile(&vsl_, b_, /*matrix=*/lhs_,
+                      /*matrix_size_along_minor_dim=*/m(),
+                      /*major_dim_offset=*/column_start,
+                      /*tile_size_along_major_dim=*/column_count);
+  }
+
+  // Load a tile of values from the RHS.  For the RHS a "tile" is a contiguous
+  // sequence of `count` values, each one broadcasted to the vector width.
+  std::vector<llvm::Value*> LoadRhsTile(llvm::Value* offset, int64 count) {
+    llvm::Value* base_pointer = vsl_.ComputeOffsetPointer(rhs_, offset);
+    std::vector<llvm::Value*> result;
+    result.reserve(count);
+    for (int64 i = 0; i < count; i++) {
+      result.push_back(vsl_.LoadBroadcast(base_pointer, i));
+    }
+    return result;
+  }
+
+  void EmitInnerLoopTiled(MemoryTile* lhs_memory_tile,
+                          const std::vector<llvm::Value*>& rhs_tile,
+                          int64 columns, bool is_first_column);
+
+  void EmitInnerLoopEpilogue(llvm::Value* current_tile_col, int64 columns,
+                             bool is_first_tiled_column);
+
+  Config config_;
+  llvm::Value* lhs_;
+  llvm::Value* rhs_;
+  llvm::Value* addend_;
+  llvm::Value* result_;
+  llvm::IRBuilder<>* b_;
+  KernelSupportLibrary ksl_;
+  VectorSupportLibrary vsl_;
+};
+
+void ColumnMajorMatrixVectorProductEmitter::EmitOuterLoopBody(
+    llvm::Value* column, int64 column_count, bool is_first_column) {
+  MemoryTile lhs_memory_tile = GetLhsMemoryTile(/*column_start=*/column,
+                                                /*column_count=*/column_count);
+
+  std::vector<llvm::Value*> rhs_tile =
+      LoadRhsTile(column, /*count=*/column_count);
+  EmitInnerLoopTiled(&lhs_memory_tile, rhs_tile,
+                     /*columns=*/column_count, is_first_column);
+  EmitInnerLoopEpilogue(column, /*columns=*/column_count, is_first_column);
+}
+
+void ColumnMajorMatrixVectorProductEmitter::Emit() {
+  // See the comment on the class declaration for the algorithm used here.
+  int64 column_remainder = k() % tile_cols();
+  int64 column_limit = k() - column_remainder;
+
+  ksl_.For("dot.outer.tiled",
+           /*start=*/0, /*end=*/column_limit, /*step=*/tile_cols(),
+           [&](llvm::Value* column, bool is_first_column) {
+             EmitOuterLoopBody(column, tile_cols(), is_first_column);
+           });
+
+  if (column_remainder != 0) {
+    EmitOuterLoopBody(b_->getInt64(column_limit), column_remainder,
+                      column_limit == 0);
+  }
+}
+
+void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
+    MemoryTile* lhs_memory_tile, const std::vector<llvm::Value*>& rhs_tile,
+    int64 columns, bool is_first_column) {
+  int64 row_limit = m() - (m() % tile_rows());
+
+  ksl_.For("dot.inner.tiled", /*start=*/0, /*end=*/row_limit,
+           /*step=*/tile_rows(), [&](llvm::Value* row) {
+             std::vector<llvm::Value*> lhs_tile =
+                 lhs_memory_tile->LoadTile(/*minor_dim_offset=*/row);
+             llvm::Value* accumulator =
+                 is_first_column ? (addend_ ? vsl_.LoadVector(addend_, row)
+                                            : vsl_.GetZeroVector())
+                                 : vsl_.LoadVector(result_, row);
+             for (int i = 0; i < columns; i++) {
+               accumulator = vsl_.MulAdd(lhs_tile[i], rhs_tile[i], accumulator);
+             }
+             vsl_.StoreVector(accumulator, result_, row);
+           });
+}
+
+void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
+    llvm::Value* current_tile_col, int64 columns, bool is_first_tiled_column) {
+  int64 row_start = m() - (m() % tile_rows());
+  if (row_start == m()) {
+    return;
+  }
+
+  llvm::Value* columns_llvm = b_->getInt64(columns);
+
+  // for (col = current_tile_col; col < (columns + current_tile_col); col++)
+  //   for (row = row_start, row < m_; row++) {
+  //     result[row] += lhs[row, col] * rhs[col]
+  //     // Also take into account that if col is 0 then result[row] is not
+  //     // initialized.
+  //   }
+
+  ksl_.For(
+      "dot.inner.epilg.outer", /*start=*/current_tile_col,
+      /*end=*/b_->CreateAdd(columns_llvm, current_tile_col),
+      /*step=*/1, /*peel_first_iteration=*/false,
+      [&](llvm::Value* col, llvm::Value* is_first_scalar_col) {
+        llvm::Value* rhs_element = vsl_.LoadScalar(rhs_, col);
+        llvm::Value* total_offset = b_->CreateMul(col, b_->getInt64(m()));
+        llvm::Value* lhs_base_pointer =
+            vsl_.ComputeOffsetPointer(lhs_, total_offset);
+        ksl_.For(
+            "dot.inner.epilg.inner", /*start=*/row_start, /*end=*/m(),
+            /*step=*/1, [&](llvm::Value* scalar_row) {
+              llvm::Value* product = vsl_.Mul(
+                  vsl_.LoadScalar(lhs_base_pointer, scalar_row), rhs_element);
+              llvm::Value* setting_result_first_time = b_->CreateAnd(
+                  is_first_scalar_col, b_->getInt1(is_first_tiled_column));
+              ksl_.If(
+                  setting_result_first_time,
+                  /*true_block_generator=*/
+                  [&]() {
+                    if (addend_) {
+                      vsl_.StoreScalar(
+                          vsl_.Add(vsl_.LoadScalar(addend_, scalar_row),
+                                   product),
+                          result_, scalar_row);
+                    } else {
+                      vsl_.StoreScalar(product, result_, scalar_row);
+                    }
+                  },
+                  /*false_block_generator=*/
+                  [&]() {
+                    vsl_.StoreScalar(
+                        vsl_.Add(vsl_.LoadScalar(result_, scalar_row), product),
+                        result_, scalar_row);
+                  });
+            });
+      });
+}
+
+// Computes a dot product between "[M,K]{1,0} lhs" with a [K,1] vector (the
+// layout of the vector does not matter).  This implementation uses a tiling
+// scheme to improve performance.
+//
+// We logically separate the LHS matrix into four segments:
+//
+//   +----------------------+---+
+//   |                      |   |
+//   |                      |   |
+//   |         A            | B |
+//   |                      |   |
+//   |                      |   |
+//   |                      |   |
+//   +----------------------+---+
+//   |         C            | D |
+//   +----------------------+---+
+//
+// where A is the largest submatrix of the LHS that can be evenly dividied into
+// tiles.  For each tile in A, assuming tile_rows_ == tile_cols_ == 4, we have:
+//
+//   +---+---+---+---+
+//   |M00|M10|M20|M30|
+//   +---+---+---+---+       +--+--+--+--+
+//   |M01|M11|M21|M31| and   |V0|V1|V2|V3|
+//   +---+---+---+---+       +--+--+--+--+
+//   |M02|M12|M22|M32|
+//   +---+---+---+---+
+//   |M03|M13|M23|M33|
+//   +---+---+---+---+
+//
+// (Legend: rows are horizontal and columns are vertical; and each row is one
+// llvm::Value of a vector type)
+//
+// where:
+//
+//   a. The left tile is loaded from the row major left matrix.
+//   b. The right vector is loaded from the RHS vector.
+//
+// We keep 4 vector accumulators accumulating the following four vector
+// expressions as we iterate over the row dimension:
+//
+//   +------+------+------+------+
+//   |M0I*V0|M1I*V1|M2I*V2|M3I*V3|  for I in [0,4)
+//   +------+------+------+------+
+//
+// In the end we do a horizontal reduction over these 4 vector accumulators to
+// get 4 values in the result vector.
+//
+// We have an inner epilogue loop to deal with the "B" sub-matrix and an outer
+// epilogue loop to deal with the C,D submatrix.
+class RowMajorMatrixVectorProductEmitter
+    : public GemvConfig::User<RowMajorMatrixVectorProductEmitter> {
+ public:
+  class Config : public GemvConfig {
+   public:
+    explicit Config(PrimitiveType scalar_type, int64 tile_rows, int64 tile_cols,
+                    int64 m, int64 k, bool has_addend)
+        : GemvConfig(/*name=*/"row_major_gemv", scalar_type,
+                     /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols, /*m=*/m,
+                     /*k=*/k, /*has_addend=*/has_addend) {}
+  };
+
+  RowMajorMatrixVectorProductEmitter(const Config& config, llvm::Value* lhs,
+                                     llvm::Value* rhs, llvm::Value* addend,
+                                     llvm::Value* result, llvm::IRBuilder<>* b)
+      : config_(config),
+        lhs_(lhs),
+        rhs_(rhs),
+        addend_(addend),
+        result_(result),
+        b_(b),
+        ksl_(b_),
+        vsl_(scalar_type(), /*vector_size=*/tile_cols(), b_, "") {
+    CHECK(tile_cols() > 0 && IsPowerOfTwo(static_cast<uint64>(tile_cols())));
+    CHECK(!has_addend() || addend != nullptr);
+  }
+
+  void Emit();
+
+  const Config& config() const { return config_; }
+
+ private:
+  MemoryTile GetLhsMemoryTile(llvm::Value* row_start, int64 row_count) {
+    return MemoryTile(&vsl_, b_, /*matrix=*/lhs_,
+                      /*matrix_size_along_minor_dim=*/k(),
+                      /*major_dim_offset=*/row_start,
+                      /*tile_size_along_major_dim=*/row_count);
+  }
+
+  void EmitOuterLoopBody(llvm::Value* row, int64 row_count);
+
+  void EmitInnerLoopTiled(MemoryTile* lhs_memory_tile, int64 rows,
+                          std::vector<VectorVariable>* vector_accumulators);
+
+  void EmitInnerLoopEpilogue(llvm::Value* current_tile_row, int64 rows,
+                             std::vector<ScalarVariable>* scalar_accumulators);
+
+  Config config_;
+  llvm::Value* lhs_;
+  llvm::Value* rhs_;
+  llvm::Value* addend_;
+  llvm::Value* result_;
+  llvm::IRBuilder<>* b_;
+  KernelSupportLibrary ksl_;
+  VectorSupportLibrary vsl_;
+};
+
+void RowMajorMatrixVectorProductEmitter::EmitOuterLoopBody(llvm::Value* row,
+                                                           int64 row_count) {
+  MemoryTile lhs_memory_tile = GetLhsMemoryTile(/*row_start=*/row,
+                                                /*row_count=*/row_count);
+  std::vector<VectorVariable> vector_accumulators;
+  std::vector<ScalarVariable> scalar_accumulators;
+  for (int i = 0; i < row_count; i++) {
+    vector_accumulators.emplace_back(&vsl_, vsl_.GetZeroVector());
+    scalar_accumulators.emplace_back(&vsl_, vsl_.GetZeroScalar());
+  }
+  EmitInnerLoopTiled(&lhs_memory_tile, /*rows=*/row_count,
+                     &vector_accumulators);
+  EmitInnerLoopEpilogue(/*current_tile_row=*/row, /*rows=*/row_count,
+                        &scalar_accumulators);
+
+  std::vector<llvm::Value*> accumulator_values;
+  std::transform(
+      vector_accumulators.begin(), vector_accumulators.end(),
+      std::back_inserter(accumulator_values),
+      [](const VectorVariable& vector_var) { return vector_var.Get(); });
+
+  std::vector<llvm::Value*> horizontal_sums;
+  if (row_count == vsl_.vector_size()) {
+    if (addend_) {
+      horizontal_sums = vsl_.ComputeHorizontalSums(
+          std::move(accumulator_values), vsl_.LoadVector(addend_, row));
+    } else {
+      horizontal_sums =
+          vsl_.ComputeHorizontalSums(std::move(accumulator_values));
+    }
+  } else {
+    horizontal_sums = vsl_.ComputeHorizontalSums(std::move(accumulator_values));
+  }
+
+  for (int i = 0; i < row_count; i++) {
+    llvm::Value* result_value =
+        vsl_.Add(horizontal_sums[i], scalar_accumulators[i].Get());
+    llvm::Value* offset = b_->CreateAdd(b_->getInt64(i), row);
+    if (addend_ && row_count != vsl_.vector_size()) {
+      result_value = vsl_.Add(vsl_.LoadScalar(addend_, offset), result_value);
+    }
+    vsl_.StoreScalar(result_value, result_, offset);
+  }
+}
+
+void RowMajorMatrixVectorProductEmitter::Emit() {
+  // See the comment on the class declaration for the algorithm used here.
+  int64 row_remainder = m() % tile_rows();
+  int64 row_limit = m() - row_remainder;
+
+  ksl_.For("dot.outer.tiled",
+           /*start=*/0, /*end=*/row_limit, /*step=*/tile_rows(),
+           [&](llvm::Value* row) { EmitOuterLoopBody(row, tile_rows()); });
+
+  if (row_remainder != 0) {
+    EmitOuterLoopBody(b_->getInt64(row_limit), row_remainder);
+  }
+}
+
+void RowMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
+    MemoryTile* lhs_memory_tile, int64 rows,
+    std::vector<VectorVariable>* vector_accumulators) {
+  int64 column_limit = k() - (k() % tile_cols());
+
+  ksl_.For("dot.inner.tiled", /*start=*/0, /*end=*/column_limit,
+           /*step=*/tile_cols(), [&](llvm::Value* col) {
+             std::vector<llvm::Value*> lhs_tile =
+                 lhs_memory_tile->LoadTile(/*minor_dim_offset=*/col);
+             llvm::Value* rhs_value = vsl_.LoadVector(rhs_, col);
+             for (int i = 0; i < rows; i++) {
+               llvm::Value* old_sum = (*vector_accumulators)[i].Get();
+               (*vector_accumulators)[i].Set(
+                   vsl_.Add(old_sum, vsl_.Mul(rhs_value, lhs_tile[i])));
+             }
+           });
+}
+
+void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
+    llvm::Value* current_tile_row, int64 rows,
+    std::vector<ScalarVariable>* scalar_accumulators) {
+  int64 column_start = k() - (k() % tile_cols());
+  if (column_start == k()) {
+    return;
+  }
+
+  for (int r = 0; r < rows; r++) {
+    llvm::Value* total_offset = b_->CreateMul(
+        b_->CreateAdd(b_->getInt64(r), current_tile_row), b_->getInt64(k()));
+    llvm::Value* lhs_base_pointer =
+        vsl_.ComputeOffsetPointer(lhs_, total_offset);
+    ksl_.For("dot.inner.epilg.inner", /*start=*/column_start, /*end=*/k(),
+             /*step=*/1, [&](llvm::Value* scalar_col) {
+               llvm::Value* product =
+                   vsl_.Mul(vsl_.LoadScalar(lhs_base_pointer, scalar_col),
+                            vsl_.LoadScalar(rhs_, scalar_col));
+               llvm::Value* old_value = (*scalar_accumulators)[r].Get();
+               (*scalar_accumulators)[r].Set(vsl_.Add(old_value, product));
+             });
+  }
+}
+
+// This class implements a tiled matrix multiplication algorithm, intended for
+// multiplying small matrices that don't need cache tiling.
+//
+// In the future this can be used as the innermost GEBP loop in a GEMM kernel as
+// described in "Goto, Kazushige, and Robert A. Geijn. "Anatomy of
+// high-performance matrix multiplication." ACM Transactions on Mathematical
+// Software (TOMS) 34.3 (2008): 12.".
+//
+// This only supports canonical dot operations (i.e. where the lhs contraction
+// dimension is 1 and the rhs contraction dimension is 0) over row major
+// matrices.
+class TiledSmallGemmEmitter {
+ public:
+  // Describe the dimensions of the kernel.
+  class Dimensions {
+   public:
+    explicit Dimensions(int64 m, int64 k, int64 n) : m_(m), k_(k), n_(n) {}
+
+    int64 m() const { return m_; }
+    int64 k() const { return k_; }
+    int64 n() const { return n_; }
+
+    string ToString() const { return absl::StrCat(m(), "x", k(), "x", n()); }
+
+   private:
+    const int64 m_;
+    const int64 k_;
+    const int64 n_;
+  };
+
+  // Represents the configuration of the emitter.  The LLVM IR emitted by the
+  // emitter, modulo the LLVM values holding the input and output buffers, must
+  // be a function of the instance of `Config` passed to it.
+  //
+  // `dims` holds the matrix multiplication dimensions.
+  //
+  // `max_vectorization_width` is the maximum vector width (i.e. the width of
+  // the largest vector register we will use).  This can be larger than the
+  // largest vector register supported by the machine -- LLVM will legalize
+  // these large vector widths into legally sized vectors.
+  //
+  // `max_vector_count` is the maximum number of vectors of size
+  // `max_vectorization_width` that we will attempt to process at once.
+  //
+  // `min_vectorization_width` is the smallest vector width the emitter will use
+  // -- below that it will devolve to using a scalar loop.
+  //
+  // The innermost reduction loop executes the matrix multiply in tiles of size
+  // [`tile_size_m`, `tile_size_k`] from the LHS and [`tile_size_k`,
+  // <vectorization width>] in the RHS.
+  class Config {
+   public:
+    explicit Config(PrimitiveType scalar_type, Dimensions dims,
+                    int64 max_vectorization_width, int64 max_vector_count,
+                    int64 min_vectorization_width, int64 tile_size_m,
+                    int64 tile_size_k)
+        : scalar_type_(scalar_type),
+          dims_(dims),
+          max_vectorization_width_(max_vectorization_width),
+          max_vector_count_(max_vector_count),
+          min_vectorization_width_(min_vectorization_width),
+          tile_size_m_(tile_size_m),
+          tile_size_k_(tile_size_k) {}
+
+    string GetCacheKey() const {
+      return absl::StrCat("gemm_", PrimitiveType_Name(scalar_type()), "_",
+                          dims().ToString(), "_", max_vectorization_width(),
+                          "_", min_vectorization_width(), "_", tile_size_m(),
+                          "_", tile_size_k());
+    }
+
+    PrimitiveType scalar_type() const { return scalar_type_; }
+    Dimensions dims() const { return dims_; }
+    int64 max_vectorization_width() const { return max_vectorization_width_; }
+    int64 max_vector_count() const { return max_vector_count_; }
+    int64 min_vectorization_width() const { return min_vectorization_width_; }
+
+    int64 tile_size_m() const { return tile_size_m_; }
+    int64 tile_size_k() const { return tile_size_k_; }
+
+   private:
+    PrimitiveType scalar_type_;
+    Dimensions dims_;
+    int64 max_vectorization_width_;
+    int64 max_vector_count_;
+    int64 min_vectorization_width_;
+    int64 tile_size_m_;
+    int64 tile_size_k_;
+  };
+
+  // Creates an instance of TiledSmallGemmEmitter that matrix-multiplies
+  // `lhs` with `rhs` and stores the result in `result`.
+  explicit TiledSmallGemmEmitter(Config config, llvm::Value* lhs,
+                                 llvm::Value* rhs, llvm::Value* result,
+                                 llvm::IRBuilder<>* b)
+      : lhs_(lhs),
+        rhs_(rhs),
+        result_(result),
+        config_(config),
+        b_(b),
+        ksl_(b_) {
+    CHECK(max_vectorization_width() > 0 &&
+          IsPowerOfTwo(static_cast<uint64>(max_vectorization_width())));
+    CHECK_GT(max_vector_count(), 0);
+    CHECK(min_vectorization_width() > 0 &&
+          IsPowerOfTwo(static_cast<uint64>(min_vectorization_width())));
+    CHECK_GE(max_vectorization_width(), min_vectorization_width());
+    CHECK_GT(tile_size_k(), 0);
+  }
+
+  void Emit();
+
+ private:
+  // The HandleResiduesOnX helpers split the iteration space for dimension X
+  // into a multiple of the tile size on dimension X and an epilogue.  These
+  // helpers ultimately call into `EmitTiledGemm` for emitting the
+  // tiled GEMM kernel.
+
+  void HandleResiduesOnN();
+  void HandleResiduesOnK(VectorSupportLibrary* vsl, llvm::Value* n_start,
+                         llvm::Value* n_end);
+  void HandleResiduesOnM(VectorSupportLibrary* vsl, int64 tile_size_k,
+                         llvm::Value* k_start, llvm::Value* k_end,
+                         llvm::Value* n_start, llvm::Value* n_end);
+
+  // This emits a tiled GEMM kernel.  For a detailed description see the comment
+  // on the implementation.
+  void EmitTiledGemm(VectorSupportLibrary* vsl, int64 tile_size_k,
+                     llvm::Value* k_start, llvm::Value* k_end,
+                     llvm::Value* n_start, llvm::Value* n_end,
+                     int64 tile_size_m, llvm::Value* m_start,
+                     llvm::Value* m_end);
+
+  llvm::Value* GetInt64(int64 value) { return b_->getInt64(value); }
+
+  Config config() const { return config_; }
+  Dimensions dims() const { return config().dims(); }
+
+  int64 max_vectorization_width() const {
+    return config().max_vectorization_width();
+  }
+  int64 max_vector_count() const { return config().max_vector_count(); }
+  int64 min_vectorization_width() const {
+    return config().min_vectorization_width();
+  }
+  int64 tile_size_m() const { return config().tile_size_m(); }
+  int64 tile_size_k() const { return config().tile_size_k(); }
+  PrimitiveType scalar_type() const { return config().scalar_type(); }
+
+  llvm::Value* lhs_;
+  llvm::Value* rhs_;
+  llvm::Value* result_;
+  Config config_;
+
+  llvm::IRBuilder<>* b_;
+  KernelSupportLibrary ksl_;
+};
+
+void TiledSmallGemmEmitter::Emit() { HandleResiduesOnN(); }
+
+void TiledSmallGemmEmitter::HandleResiduesOnN() {
+  // We can only iterate the `n` dimension for an extent that is divisible by
+  // the vectorization width.  So we emit an outer loop that first processes the
+  // largest extent in `n` that is divisible by max_vectorization_width, then
+  // the largest remaining extent that is divisible by max_vectorization_width /
+  // 2 etc.
+
+  int64 current_vectorization_width =
+      max_vector_count() * max_vectorization_width();
+  int64 current_vector_count = max_vector_count();
+
+  int64 n_start = 0;
+  while (n_start != dims().n() &&
+         current_vectorization_width >= min_vectorization_width()) {
+    int64 n_end = dims().n() - (dims().n() % current_vectorization_width);
+    if (n_start != n_end) {
+      VectorSupportLibrary vsl(scalar_type(), current_vectorization_width, b_,
+                               "gemm");
+      HandleResiduesOnK(&vsl, GetInt64(n_start), GetInt64(n_end));
+      n_start = n_end;
+    }
+    if (current_vector_count == 1) {
+      current_vectorization_width /= 2;
+    } else {
+      current_vector_count--;
+      current_vectorization_width =
+          current_vector_count * max_vectorization_width();
+    }
+  }
+
+  if (n_start != dims().n()) {
+    VectorSupportLibrary vsl(scalar_type(), 1, b_, "gemm");
+    ksl_.For("epi.n", n_start, dims().n(), 1, [&](llvm::Value* n_i) {
+      llvm::Value* n_i_next = b_->CreateAdd(n_i, b_->getInt64(1));
+      HandleResiduesOnK(&vsl, n_i, n_i_next);
+    });
+  }
+}
+
+void TiledSmallGemmEmitter::HandleResiduesOnK(VectorSupportLibrary* vsl,
+                                              llvm::Value* n_start,
+                                              llvm::Value* n_end) {
+  int64 k_start = 0;
+  int64 k_end = dims().k() - (dims().k() % tile_size_k());
+  if (k_end != k_start) {
+    HandleResiduesOnM(vsl, tile_size_k(), GetInt64(k_start), GetInt64(k_end),
+                      n_start, n_end);
+    k_start = k_end;
+  }
+
+  if (k_start != dims().k()) {
+    HandleResiduesOnM(vsl, dims().k() - k_start, GetInt64(k_start),
+                      GetInt64(dims().k()), n_start, n_end);
+  }
+}
+
+void TiledSmallGemmEmitter::HandleResiduesOnM(
+    VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start,
+    llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end) {
+  const int64 m_end = dims().m() - dims().m() % tile_size_m();
+  EmitTiledGemm(vsl, tile_size_k, k_start, k_end, n_start, n_end, tile_size_m(),
+                GetInt64(0), GetInt64(m_end));
+
+  if (m_end != dims().m()) {
+    EmitTiledGemm(vsl, tile_size_k, k_start, k_end, n_start, n_end,
+                  dims().m() - m_end, GetInt64(m_end), GetInt64(dims().m()));
+  }
+}
+
+// The loop structure is:
+//
+// Iterate over dimension M as m:
+//   Iterate over dimension N as n:
+//     Iterate over dimension K as k:
+//       OutputTile[m,n] += Dot(LhsTile[m,k], RhsTile[k,n])
+//
+// I.e. a just a tiled version of a "naive" GEMM.
+//
+// The tiling scheme is as follows:
+//
+// Let the LHS be:
+//
+//   +----+----+----+
+//   | a0 | b0 | c0 | .
+//   +----+----+----+ .
+//   | a1 | b1 | c1 | .
+//   +----+----+----+
+//     ..     ..
+//
+// and the RHS be:
+//
+//   +----+----+----+----+
+//   | p0 | p1 | p2 | p3 | .
+//   +----+----+----+----+ .
+//   | q0 | q1 | q2 | q3 | .
+//   +----+----+----+----+
+//   | r0 | r1 | r2 | r3 | .
+//   +----+----+----+----+ .
+//     ......    ......
+//
+// and let tile_size_m=2, tile_size_k=3 and the vector width (implicitly denoted
+// by `vsl`) be 4.  Then we want to matrix multiply this tile to get a [2,4]
+// matrix that we can increment the result matrix by.
+//
+// First broadcast the rows row in LHS to 3 vectors of width 4, giving us a rank
+// 3 array, L, of dimension [2,3,4]:
+//
+//       L[0,_,_]           *      L[1,_,_]
+//                          *
+//   +----+----+----+----+  *  +----+----+----+----+
+//   | a0 | a0 | a0 | a0 |  *  | a1 | a1 | a1 | a1 |
+//   +----+----+----+----+  *  +----+----+----+----+
+//   | b0 | b0 | b0 | b0 |  *  | b1 | b1 | b1 | b1 |
+//   +----+----+----+----+  *  +----+----+----+----+
+//   | c0 | c0 | c0 | c0 |  *  | c1 | c1 | c1 | c1 |
+//   +----+----+----+----+  *  +----+----+----+----+
+//
+//
+// Then we FMA L[0,_,_] with the RHS to get the first row of the result and
+// L[1,_,_] with the RHS to get the second row of the result.  For example,
+// L[0,_,_] is computed as:
+//
+//   +----+----+----+----+   +----+----+----+----+
+//   | a0 | a0 | a0 | a0 | * | p0 | p1 | p2 | p3 |   +
+//   +----+----+----+----+   +----+----+----+----+
+//
+//   +----+----+----+----+   +----+----+----+----+
+//   | b0 | b0 | b0 | b0 | * | q0 | q1 | q2 | q3 |   +
+//   +----+----+----+----+   +----+----+----+----+
+//
+//   +----+----+----+----+   +----+----+----+----+
+//   | c0 | c0 | c0 | c0 | * | r0 | r1 | r2 | r3 |
+//   +----+----+----+----+   +----+----+----+----+
+//
+// to get:
+//
+//   +-------------------+-------------------+-------------------+---------
+//   | a0*p0+b0*q0+c0*r0 | a0*p1+b0*q1+c0*r1 | a0*p2+b0*q2+c0*r2 |  ...
+//   +-------------------+-------------------+-------------------+---------
+void TiledSmallGemmEmitter::EmitTiledGemm(
+    VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start,
+    llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end,
+    int64 tile_size_m, llvm::Value* m_start, llvm::Value* m_end) {
+  ksl_.For("dot.m", m_start, m_end, tile_size_m, [&](llvm::Value* m_i) {
+    MemoryTile result_memory_tile(vsl, b_, /*matrix=*/result_,
+                                  /*matrix_size_along_minor_dim=*/dims().n(),
+                                  /*major_dim_offset=*/m_i,
+                                  /*tile_size_along_major_dim=*/tile_size_m);
+    MemoryTile lhs_memory_tile(vsl, b_, /*matrix=*/lhs_,
+                               /*matrix_size_along_minor_dim=*/dims().k(),
+                               /*major_dim_offset=*/m_i,
+                               /*tile_size_along_major_dim=*/tile_size_m);
+    ksl_.For(
+        "dot.n", n_start, n_end, vsl->vector_size(), [&](llvm::Value* n_i) {
+          TileVariable result_tile_var(vsl, result_memory_tile.LoadTile(n_i));
+          ksl_.For("dot.k", k_start, k_end, tile_size_k, [&](llvm::Value* k_i) {
+            MemoryTile rhs_memory_tile(vsl, b_, rhs_, dims().n(), k_i,
+                                       tile_size_k);
+            std::vector<std::vector<llvm::Value*>> lhs_tile =
+                lhs_memory_tile.LoadBroadcastTile(k_i, tile_size_k);
+            std::vector<llvm::Value*> rhs_tile = rhs_memory_tile.LoadTile(n_i);
+            std::vector<llvm::Value*> result_tile = result_tile_var.Get();
+            for (int64 r_m_i = 0; r_m_i < tile_size_m; r_m_i++) {
+              for (int64 r_k_i = 0; r_k_i < tile_size_k; r_k_i++) {
+                result_tile[r_m_i] =
+                    vsl->MulAdd(lhs_tile[r_m_i][r_k_i], rhs_tile[r_k_i],
+                                result_tile[r_m_i]);
+              }
+            }
+            result_tile_var.Set(result_tile);
+          });
+
+          result_memory_tile.StoreTile(result_tile_var.Get(), n_i);
+        });
+  });
+}
+
+}  // namespace
+
+void EmitRowMajorGemv(PrimitiveType scalar_type, int64 tile_rows,
+                      int64 tile_cols, int64 m, int64 k, llvm::Value* lhs,
+                      llvm::Value* rhs, llvm::Value* addend,
+                      llvm::Value* result, llvm::IRBuilder<>* b,
+                      bool enable_fast_math, bool optimize_for_size) {
+  RowMajorMatrixVectorProductEmitter::Config config(
+      /*scalar_type=*/scalar_type,
+      /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols,
+      /*m=*/m, /*k=*/k, /*has_addend=*/addend != nullptr);
+
+  KernelSupportLibrary::EmitAndCallOutlinedKernel(
+      /*enable_fast_math=*/enable_fast_math,
+      /*optimize_for_size=*/optimize_for_size, b, config.GetCacheKey(), lhs,
+      rhs, addend, result,
+      [&](llvm::Value* lhs, llvm::Value* rhs, llvm::Value* addend,
+          llvm::Value* result) {
+        RowMajorMatrixVectorProductEmitter emitter(config, lhs, rhs, addend,
+                                                   result, b);
+        emitter.Emit();
+      });
+}
+
+void EmitColumnMajorGemv(PrimitiveType scalar_type, int64 tile_rows,
+                         int64 tile_cols, int64 m, int64 k, llvm::Value* lhs,
+                         llvm::Value* rhs, llvm::Value* addend,
+                         llvm::Value* result, llvm::IRBuilder<>* b,
+                         bool enable_fast_math, bool optimize_for_size) {
+  ColumnMajorMatrixVectorProductEmitter::Config config(
+      /*scalar_type=*/scalar_type,
+      /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols,
+      /*m=*/m, /*k=*/k, /*has_addend=*/addend != nullptr);
+
+  KernelSupportLibrary::EmitAndCallOutlinedKernel(
+      /*enable_fast_math=*/enable_fast_math,
+      /*optimize_for_size=*/optimize_for_size, b, config.GetCacheKey(), lhs,
+      rhs, addend, result,
+      [&](llvm::Value* lhs, llvm::Value* rhs, llvm::Value* addend,
+          llvm::Value* result) {
+        ColumnMajorMatrixVectorProductEmitter emitter(config, lhs, rhs, addend,
+                                                      result, b);
+        emitter.Emit();
+      });
+}
+
+void EmitSmallGemm(PrimitiveType scalar_type, int64 m, int64 k, int64 n,
+                   int64 max_vectorization_width, int64 max_vector_count,
+                   int64 min_vectorization_width, int64 tile_size_m,
+                   int64 tile_size_k, llvm::Value* lhs, llvm::Value* rhs,
+                   llvm::Value* result, llvm::IRBuilder<>* b,
+                   bool enable_fast_math, bool optimize_for_size) {
+  TiledSmallGemmEmitter::Config config(
+      /*scalar_type=*/scalar_type,
+      TiledSmallGemmEmitter::Dimensions{/*m=*/m, /*k=*/k, /*n=*/n},
+      /*max_vectorization_width=*/max_vectorization_width,
+      /*max_vector_count=*/max_vector_count,
+      /*min_vectorization_width=*/min_vectorization_width,
+      /*tile_size_m=*/tile_size_m, /*tile_size_k=*/tile_size_k);
+
+  KernelSupportLibrary::EmitAndCallOutlinedKernel(
+      /*enable_fast_math=*/enable_fast_math,
+      /*optimize_for_size=*/optimize_for_size, b, config.GetCacheKey(), lhs,
+      rhs, result,
+      [&](llvm::Value* lhs, llvm::Value* rhs, llvm::Value* result) {
+        TiledSmallGemmEmitter small_gemm_emitter(config, /*lhs=*/lhs,
+                                                 /*rhs=*/rhs,
+                                                 /*result=*/result, b);
+        small_gemm_emitter.Emit();
+      });
+}
+
+}  // namespace cpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h b/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a82326cc3704bce8c122261383249c60eda1f3a
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TILED_DOT_EMITTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TILED_DOT_EMITTER_H_
+
+#include "llvm/IR/IRBuilder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace cpu {
+
+// These routines emit LLVM IR implementing tiled GEMM and GEMV routines.
+
+void EmitRowMajorGemv(PrimitiveType scalar_type, tensorflow::int64 tile_rows,
+                      tensorflow::int64 tile_cols, tensorflow::int64 m,
+                      tensorflow::int64 k, llvm::Value* lhs, llvm::Value* rhs,
+                      llvm::Value* addend, llvm::Value* result,
+                      llvm::IRBuilder<>* b, bool enable_fast_math,
+                      bool optimize_for_size);
+
+void EmitColumnMajorGemv(PrimitiveType scalar_type, tensorflow::int64 tile_rows,
+                         tensorflow::int64 tile_cols, tensorflow::int64 m,
+                         tensorflow::int64 k, llvm::Value* lhs,
+                         llvm::Value* rhs, llvm::Value* addend,
+                         llvm::Value* result, llvm::IRBuilder<>* b,
+                         bool enable_fast_math, bool optimize_for_size);
+
+void EmitSmallGemm(PrimitiveType scalar_type, tensorflow::int64 m,
+                   tensorflow::int64 k, tensorflow::int64 n,
+                   tensorflow::int64 max_vectorization_width,
+                   tensorflow::int64 max_vector_count,
+                   tensorflow::int64 min_vectorization_width,
+                   tensorflow::int64 tile_size_m, tensorflow::int64 tile_size_k,
+                   llvm::Value* lhs, llvm::Value* rhs, llvm::Value* result,
+                   llvm::IRBuilder<>* b, bool enable_fast_math,
+                   bool optimize_for_size);
+
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TILED_DOT_EMITTER_H_
diff --git a/tensorflow/compiler/xla/service/dot_decomposer.cc b/tensorflow/compiler/xla/service/dot_decomposer.cc
index b2ba2617902104bfea06713332fa1c2aedea536d..855424067d26d4968270e5f24b11f5a053b70a55 100644
--- a/tensorflow/compiler/xla/service/dot_decomposer.cc
+++ b/tensorflow/compiler/xla/service/dot_decomposer.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/dot_decomposer.h"
 
+#include "absl/algorithm/container.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -156,29 +158,187 @@ Status DecomposeBatchDot(HloInstruction* dot) {
   return computation->ReplaceInstruction(dot, new_dot);
 }
 
+// Convert a dot into a canonical form where non-contracting and contracting
+// dimensions are reshaped together and batch dimensions are the most major
+// dimensions. The requires transposing and reshapes the lhs and rhs and
+// reshaping the output batch to the original shape.
+Status CanonicalizeDot(HloInstruction* original_dot) {
+  auto computation = original_dot->parent();
+  const auto& original_dnums = original_dot->dot_dimension_numbers();
+  const int64 num_batch_dims = original_dnums.lhs_batch_dimensions_size();
+  const int64 num_contracting_dims =
+      original_dnums.lhs_contracting_dimensions_size();
+
+  const auto& lhs_shape = original_dot->operand(0)->shape();
+  const int64 lhs_rank = lhs_shape.rank();
+  const int64 num_lhs_non_contracting_dims =
+      lhs_rank - num_batch_dims - num_contracting_dims;
+
+  std::vector<int64> lhs_non_contracting_dims;
+  lhs_non_contracting_dims.reserve(num_lhs_non_contracting_dims);
+  int64 lhs_contracting_size = 1;
+  int64 lhs_non_contracting_size = 1;
+  std::vector<int64> batch_dim_sizes;
+  batch_dim_sizes.reserve(num_batch_dims);
+  for (int64 i = 0; i < lhs_rank; ++i) {
+    if (absl::c_linear_search(original_dnums.lhs_contracting_dimensions(), i)) {
+      lhs_contracting_size *= lhs_shape.dimensions(i);
+    } else if (absl::c_linear_search(original_dnums.lhs_batch_dimensions(),
+                                     i)) {
+      batch_dim_sizes.push_back(lhs_shape.dimensions(i));
+    } else {
+      lhs_non_contracting_dims.push_back(i);
+      lhs_non_contracting_size *= lhs_shape.dimensions(i);
+    }
+  }
+  // The canonical form of the lhs is
+  // [BatchDims, NonContractingDims, ContractingsDims]
+  std::vector<int64> lhs_transpose;
+  lhs_transpose.reserve(lhs_rank);
+  lhs_transpose.insert(lhs_transpose.end(),
+                       original_dnums.lhs_batch_dimensions().begin(),
+                       original_dnums.lhs_batch_dimensions().end());
+  lhs_transpose.insert(lhs_transpose.end(), lhs_non_contracting_dims.begin(),
+                       lhs_non_contracting_dims.end());
+  lhs_transpose.insert(lhs_transpose.end(),
+                       original_dnums.lhs_contracting_dimensions().begin(),
+                       original_dnums.lhs_contracting_dimensions().end());
+  HloInstruction* transposed_lhs =
+      computation->AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::PermuteDimensions(InversePermutation(lhs_transpose),
+                                       lhs_shape),
+          original_dot->mutable_operand(0), lhs_transpose));
+  std::vector<int64> lhs_reshape_dims = batch_dim_sizes;
+  lhs_reshape_dims.push_back(lhs_non_contracting_size);
+  lhs_reshape_dims.push_back(lhs_contracting_size);
+  // Reshape the contracting and non-contracting dimensions together.
+  HloInstruction* reshaped_lhs =
+      computation->AddInstruction(HloInstruction::CreateReshape(
+          ShapeUtil::MakeShape(lhs_shape.element_type(), lhs_reshape_dims),
+          transposed_lhs));
+
+  const auto& rhs_shape = original_dot->operand(1)->shape();
+  const int64 rhs_rank = rhs_shape.rank();
+  const int64 num_rhs_non_contracting_dims =
+      rhs_rank - num_batch_dims - num_contracting_dims;
+  std::vector<int64> rhs_non_contracting_dims;
+  rhs_non_contracting_dims.reserve(num_rhs_non_contracting_dims);
+  int64 rhs_non_contracting_size = 1;
+  int64 rhs_contracting_size = 1;
+  for (int64 i = 0; i < rhs_rank; ++i) {
+    if (absl::c_linear_search(original_dnums.rhs_contracting_dimensions(), i)) {
+      rhs_contracting_size *= rhs_shape.dimensions(i);
+    } else if (!absl::c_linear_search(original_dnums.rhs_batch_dimensions(),
+                                      i)) {
+      rhs_non_contracting_dims.push_back(i);
+      rhs_non_contracting_size *= rhs_shape.dimensions(i);
+    }
+  }
+
+  // The canonical form of the rhs is
+  // [BatchDims, ContractingsDims, NonContractingDims]
+  std::vector<int64> rhs_transpose;
+  rhs_transpose.reserve(rhs_rank);
+  rhs_transpose.insert(rhs_transpose.end(),
+                       original_dnums.rhs_batch_dimensions().begin(),
+                       original_dnums.rhs_batch_dimensions().end());
+  rhs_transpose.insert(rhs_transpose.end(),
+                       original_dnums.rhs_contracting_dimensions().begin(),
+                       original_dnums.rhs_contracting_dimensions().end());
+  rhs_transpose.insert(rhs_transpose.end(), rhs_non_contracting_dims.begin(),
+                       rhs_non_contracting_dims.end());
+  HloInstruction* transposed_rhs =
+      computation->AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::PermuteDimensions(InversePermutation(rhs_transpose),
+                                       rhs_shape),
+          original_dot->mutable_operand(1), rhs_transpose));
+
+  std::vector<int64> rhs_reshape_dims = batch_dim_sizes;
+  rhs_reshape_dims.push_back(rhs_contracting_size);
+  rhs_reshape_dims.push_back(rhs_non_contracting_size);
+  // Reshape the contracting and non-contracting dimensions together.
+  HloInstruction* reshaped_rhs =
+      computation->AddInstruction(HloInstruction::CreateReshape(
+          ShapeUtil::MakeShape(rhs_shape.element_type(), rhs_reshape_dims),
+          transposed_rhs));
+
+  std::vector<int64> dot_dims = batch_dim_sizes;
+  dot_dims.push_back(lhs_non_contracting_size);
+  dot_dims.push_back(rhs_non_contracting_size);
+
+  DotDimensionNumbers dot_dnums;
+  for (int64 i = 0; i < num_batch_dims; ++i) {
+    dot_dnums.add_lhs_batch_dimensions(i);
+    dot_dnums.add_rhs_batch_dimensions(i);
+  }
+  dot_dnums.add_lhs_contracting_dimensions(num_batch_dims + 1);
+  dot_dnums.add_rhs_contracting_dimensions(num_batch_dims);
+
+  HloInstruction* dot = computation->AddInstruction(HloInstruction::CreateDot(
+      ShapeUtil::MakeShape(original_dot->shape().element_type(), dot_dims),
+      reshaped_lhs, reshaped_rhs, dot_dnums, original_dot->precision_config()));
+
+  return computation->ReplaceInstruction(
+      original_dot, computation->AddInstruction(HloInstruction::CreateReshape(
+                        original_dot->shape(), dot)));
+}
+
 }  // namespace
 
 StatusOr<bool> DotDecomposer::Run(HloModule* module) {
   XLA_VLOG_LINES(2, "DotDecomposer ENTRY\n" + module->ToString());
-  // Gather all batch Dot operations.
-  std::vector<HloInstruction*> batch_dots;
+  // Gather all Non-canonical Dot operations.
+  std::vector<HloInstruction*> non_canonical_dots;
   for (auto* computation : module->MakeNonfusionComputations()) {
     for (auto* instruction : computation->instructions()) {
       if (instruction->opcode() != HloOpcode::kDot) {
         continue;
       }
       const DotDimensionNumbers& dnums = instruction->dot_dimension_numbers();
-      if (dnums.lhs_batch_dimensions_size() > 0 && decompose_batch_dot_) {
-        batch_dots.push_back(instruction);
+      // A dot it not canonical if there are more than one contracting
+      // dimension.
+      if (dnums.lhs_contracting_dimensions_size() > 1) {
+        non_canonical_dots.push_back(instruction);
+        continue;
+      }
+      if (dnums.lhs_batch_dimensions().empty()) {
+        continue;
+      }
+      std::vector<int64> canonical_batch_dims(
+          dnums.lhs_batch_dimensions_size());
+      absl::c_iota(canonical_batch_dims, 0);
+      if (!absl::c_equal(dnums.lhs_batch_dimensions(), canonical_batch_dims) ||
+          !absl::c_equal(dnums.rhs_batch_dimensions(), canonical_batch_dims)) {
+        non_canonical_dots.push_back(instruction);
       }
     }
   }
-  // Decompose each batch Dot in 'batch_dots'.
   bool changed = false;
-  for (auto* dot : batch_dots) {
-    TF_RETURN_IF_ERROR(DecomposeBatchDot(dot));
+  for (auto* dot : non_canonical_dots) {
+    TF_RETURN_IF_ERROR(CanonicalizeDot(dot));
     changed = true;
   }
+
+  if (decompose_batch_dot_) {
+    std::vector<HloInstruction*> batch_dots;
+    for (auto* computation : module->MakeNonfusionComputations()) {
+      for (auto* instruction : computation->instructions()) {
+        if (instruction->opcode() != HloOpcode::kDot) {
+          continue;
+        }
+        const DotDimensionNumbers& dnums = instruction->dot_dimension_numbers();
+        if (!dnums.lhs_batch_dimensions().empty()) {
+          batch_dots.push_back(instruction);
+        }
+      }
+    }
+    // Decompose each batch Dot in 'batch_dots'.
+
+    for (auto* dot : batch_dots) {
+      TF_RETURN_IF_ERROR(DecomposeBatchDot(dot));
+      changed = true;
+    }
+  }
   XLA_VLOG_LINES(2, "DotDecompose EXIT\n" + module->ToString());
   return changed;
 }
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
index f2b7e9a186135d6246d2dc6c5fb3fa0b70145d10..2b158d7a6ec510ce4cbc56bddc5cca71ac4f14f4 100644
--- a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
@@ -433,7 +433,7 @@ Status DynamicDimensionInferenceVisitor::ForEachOperandDynamicDimension(
 /* static */
 StatusOr<DynamicDimensionInference> DynamicDimensionInference::Run(
     HloModule* module) {
-  VLOG(0) << "Param Config " << module->dynamic_parameter_binding().ToString();
+  VLOG(2) << "Param Config " << module->dynamic_parameter_binding().ToString();
   DynamicDimensionInference inference(module);
   TF_RETURN_IF_ERROR(inference.AnalyzeDynamicDimensions());
   return inference;
diff --git a/tensorflow/compiler/xla/service/dynamic_index_splitter.cc b/tensorflow/compiler/xla/service/dynamic_index_splitter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ede4a5afdb7d02b31b9d0839fc8716f9b814e544
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_index_splitter.cc
@@ -0,0 +1,97 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
+
+#include <map>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/container/inlined_vector.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+
+namespace xla {
+
+StatusOr<bool> DynamicIndexSplitter::Run(HloModule* module) {
+  bool changed = false;
+
+  std::vector<HloComputation*> computations =
+      module->MakeNonfusionComputations();
+  for (HloComputation* computation : computations) {
+    for (HloInstruction* dynamic_op : computation->MakeInstructionPostOrder()) {
+      switch (dynamic_op->opcode()) {
+        case HloOpcode::kDynamicSlice:
+        case HloOpcode::kDynamicUpdateSlice:
+          break;
+        default:
+          continue;
+      }
+      auto parent = dynamic_op->parent();
+      bool is_update = dynamic_op->opcode() == HloOpcode::kDynamicUpdateSlice;
+      int64 index_operand_number = Cast<HloDynamicIndexInstruction>(dynamic_op)
+                                       ->first_index_operand_number();
+      auto index_operand = dynamic_op->mutable_operand(index_operand_number);
+      if (ShapeUtil::IsScalar(index_operand->shape())) {
+        // This DS/DUS already uses scalar indices.
+        continue;
+      }
+      TF_RET_CHECK(index_operand->shape().rank() == 1);
+      int64 num_indices = index_operand->shape().dimensions(0);
+      if (num_indices == 0) {
+        // If the operand dimension is 0, directly replace R0 DS/DUS with the
+        // operand (for DS) or update (for DUS).
+        if (is_update) {
+          TF_CHECK_OK(parent->ReplaceInstruction(
+              dynamic_op, dynamic_op->mutable_operand(1)));
+        } else {
+          TF_CHECK_OK(parent->ReplaceInstruction(
+              dynamic_op, dynamic_op->mutable_operand(0)));
+        }
+        changed = true;
+        continue;
+      }
+      auto index_element_type = index_operand->shape().element_type();
+      std::vector<HloInstruction*> index_array;
+      for (int64 dim = 0; dim < num_indices; ++dim) {
+        auto slice = parent->AddInstruction(HloInstruction::CreateSlice(
+            ShapeUtil::MakeShape(index_element_type, {1}), index_operand, {dim},
+            {dim + 1}, {1}));
+        auto bitcast = parent->AddInstruction(HloInstruction::CreateReshape(
+            ShapeUtil::MakeShape(index_element_type, {}), slice));
+        index_array.push_back(bitcast);
+      }
+      auto new_dynamic_op =
+          is_update
+              ? HloInstruction::CreateDynamicUpdateSlice(
+                    dynamic_op->shape(), dynamic_op->mutable_operand(0),
+                    dynamic_op->mutable_operand(1), absl::MakeSpan(index_array))
+              : HloInstruction::CreateDynamicSlice(
+                    dynamic_op->shape(), dynamic_op->mutable_operand(0),
+                    absl::MakeSpan(index_array),
+                    dynamic_op->dynamic_slice_sizes());
+      TF_CHECK_OK(parent->ReplaceWithNewInstruction(dynamic_op,
+                                                    std::move(new_dynamic_op)));
+      changed = true;
+    }
+  }
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dynamic_index_splitter.h b/tensorflow/compiler/xla/service/dynamic_index_splitter.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c12e3a4af287ad2272a08ba54cd99c2cad9d451
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_index_splitter.h
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_INDEX_SPLITTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_INDEX_SPLITTER_H_
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// Convert R1 index operands to DynamicSlice and DynamicUpdateSlice ops into
+// separate scalars.
+class DynamicIndexSplitter : public HloModulePass {
+ public:
+  DynamicIndexSplitter() = default;
+  absl::string_view name() const override { return "dynamic-index-splitter"; }
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_INDEX_SPLITTER_H_
diff --git a/tensorflow/compiler/xla/service/dynamic_index_splitter_test.cc b/tensorflow/compiler/xla/service/dynamic_index_splitter_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..98029d1faff7d669730f6b66e38fcefece70f0eb
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_index_splitter_test.cc
@@ -0,0 +1,134 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+
+namespace xla {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+class DynamicIndexSplitterTest : public HloTestBase {};
+
+TEST_F(DynamicIndexSplitterTest, DynamicSlice) {
+  const char* const kDynamicSlice = R"(
+    HloModule DynamicSlice_module
+
+    ENTRY entry (operand: s32[4,5,6], indices: s32[3]) -> s32[1,1,1] {
+      operand = s32[4,5,6] parameter(0)
+      indices = s32[3] parameter(1)
+      ROOT dynamic-slice = s32[1,1,1] dynamic-slice(operand, indices), dynamic_slice_sizes={1,1,1}
+    }
+  )";
+
+  HloModuleConfig config;
+  DebugOptions debug_options = config.debug_options();
+  debug_options.set_xla_allow_scalar_index_dynamic_ops(true);
+  config.set_debug_options(debug_options);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(kDynamicSlice, config));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          DynamicIndexSplitter().Run(module.get()));
+  EXPECT_TRUE(changed);
+  ASSERT_THAT(module->entry_computation()->root_instruction(),
+              op::DynamicSlice(op::Parameter(0),
+                               op::Reshape(op::Slice(op::Parameter(1))),
+                               op::Reshape(op::Slice(op::Parameter(1))),
+                               op::Reshape(op::Slice(op::Parameter(1)))));
+
+  for (int i = 0; i < 3; ++i) {
+    const HloInstruction* slice = module->entry_computation()
+                                      ->root_instruction()
+                                      ->operand(i + 1)
+                                      ->operand(0);
+    EXPECT_EQ(slice->slice_starts(0), i);
+    EXPECT_EQ(slice->slice_limits(0), i + 1);
+  }
+}
+
+TEST_F(DynamicIndexSplitterTest, DynamicUpdateSlice) {
+  const char* const kDynamicUpdateSlice = R"(
+    HloModule DynamicUpdatedSlice_module
+
+    ENTRY entry (operand: s32[4,5,6], indices: s32[3], update: s32[1,1,1]) -> s32[4,5,6] {
+      operand = s32[4,5,6] parameter(0)
+      indices = s32[3] parameter(1)
+      update = s32[1,1,1] parameter(2)
+      ROOT dynamic-update-slice = s32[4,5,6] dynamic-update-slice(operand, update, indices)
+    }
+  )";
+
+  HloModuleConfig config;
+  DebugOptions debug_options = config.debug_options();
+  debug_options.set_xla_allow_scalar_index_dynamic_ops(true);
+  config.set_debug_options(debug_options);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kDynamicUpdateSlice, config));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          DynamicIndexSplitter().Run(module.get()));
+  EXPECT_TRUE(changed);
+  ASSERT_THAT(module->entry_computation()->root_instruction(),
+              op::DynamicUpdateSlice(op::Parameter(0), op::Parameter(2),
+                                     op::Reshape(op::Slice(op::Parameter(1))),
+                                     op::Reshape(op::Slice(op::Parameter(1))),
+                                     op::Reshape(op::Slice(op::Parameter(1)))));
+
+  for (int i = 0; i < 3; ++i) {
+    const HloInstruction* slice = module->entry_computation()
+                                      ->root_instruction()
+                                      ->operand(i + 2)
+                                      ->operand(0);
+    EXPECT_EQ(slice->slice_starts(0), i);
+    EXPECT_EQ(slice->slice_limits(0), i + 1);
+  }
+}
+
+TEST_F(DynamicIndexSplitterTest, AlreadyScalar) {
+  const char* const kDynamicSlice = R"(
+    HloModule DynamicSlice_module
+
+    ENTRY entry (operand: s32[4,5,6], index.0: s32[], index.1: s32[], index.2: s32[]) -> s32[1,1,1] {
+      operand = s32[4,5,6] parameter(0)
+      index.0 = s32[] parameter(1)
+      index.1 = s32[] parameter(2)
+      index.2 = s32[] parameter(3)
+      ROOT dynamic-slice = s32[1,1,1] dynamic-slice(operand, index.0, index.1, index.2), dynamic_slice_sizes={1,1,1}
+    }
+  )";
+
+  HloModuleConfig config;
+  DebugOptions debug_options = config.debug_options();
+  debug_options.set_xla_allow_scalar_index_dynamic_ops(true);
+  config.set_debug_options(debug_options);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(kDynamicSlice, config));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          DynamicIndexSplitter().Run(module.get()));
+  EXPECT_FALSE(changed);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::DynamicSlice(op::Parameter(0), op::Parameter(1),
+                               op::Parameter(2), op::Parameter(3)));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dynamic_padder.cc b/tensorflow/compiler/xla/service/dynamic_padder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4db280f817141bd52e3a5b9564600a618f81aeac
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_padder.cc
@@ -0,0 +1,161 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/compiler/xla/service/dynamic_padder.h"
+
+#include <algorithm>
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
+
+#include "absl/container/flat_hash_set.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+
+namespace {
+
+// ChooseIdentityValue looks at the instruction and returns a identity value
+// which, when padded, doesn't change the result of the instruction.
+//
+// nullopt is returned if padding doesn't need to be reset.
+StatusOr<HloInstruction*> ChooseIdentityValue(HloInstruction* inst) {
+  HloComputation* comp = inst->parent();
+  // Padding on elementwise operation doesn't affect the result of the effective
+  // data.
+  if (inst->IsElementwise()) {
+    return nullptr;
+  }
+
+  switch (inst->opcode()) {
+    case HloOpcode::kReduce:
+    case HloOpcode::kReduceWindow: {
+      // Because of the way we do reduce, we already require the `init` operand
+      // of hlo reduce instruction to be identity value. Here we reuse the
+      // operand.
+      return inst->mutable_operand(1);
+    }
+
+    case HloOpcode::kConvolution:
+    case HloOpcode::kDot: {
+      // Use 0 as padding value for convolution and dot.
+      PrimitiveType ptype = inst->shape().element_type();
+      return comp->AddInstruction(
+          HloInstruction::CreateConstant(LiteralUtil::Zero(ptype)));
+    }
+
+    case HloOpcode::kPad: {
+      return inst->mutable_operand(1);
+    }
+    case HloOpcode::kParameter:
+    case HloOpcode::kGetDimensionSize:
+    case HloOpcode::kReshape:
+    case HloOpcode::kTuple:
+    case HloOpcode::kAllReduce:
+    case HloOpcode::kBroadcast:
+      return nullptr;
+    default:
+      return UnimplementedStrCat("Unimplimented padding for instruction: ",
+                                 inst->ToString());
+  }
+}
+
+}  // namespace
+
+StatusOr<bool> DynamicPadder::Run(HloModule* module) {
+  bool changed = false;
+  VLOG(2) << "Pre DynamicPadder HLO:";
+  XLA_VLOG_LINES(2, module->ToString());
+  TF_ASSIGN_OR_RETURN(DynamicDimensionInference dynamic_dimension_inference,
+                      DynamicDimensionInference::Run(module));
+
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* inst : computation->instructions()) {
+      for (int64 operand_num = 0; operand_num < inst->operand_count();
+           ++operand_num) {
+        HloInstruction* operand = inst->mutable_operand(operand_num);
+        if (!operand->shape().IsArray()) {
+          continue;
+        }
+        for (int64 dim = 0; dim < operand->shape().rank(); ++dim) {
+          HloInstruction* dynamic_size =
+              dynamic_dimension_inference.GetDynamicSize(operand, {}, dim);
+          if (dynamic_size == nullptr) {
+            continue;
+          }
+          VLOG(1) << "Has dynamic dimension of operand" << operand_num << " @"
+                  << dim;
+          TF_ASSIGN_OR_RETURN(HloInstruction * identity_value,
+                              ChooseIdentityValue(inst));
+          if (identity_value == nullptr) {
+            continue;
+          }
+
+          // For each dimension, first generates a mask representing the
+          // effective area of data and padded area of data using iota and
+          // dynamic_size. For example, given a dimension of 7 elements and 5
+          // effective elements:
+          //
+          // iota = [0, 1, 2, 3, 4, 5, 6]
+          // broadcast_dynamic_size = [5, 5, 5, 5, 5, 5, 5]
+          // mask = lt(iota, broadcast_dynamic_size) = [t, t, t, t, t, f, f]
+          //
+          // Once the mask is generated, the input data is then padded using the
+          // mask and pad value.
+          //
+          const Shape mask_shape =
+              ShapeUtil::ChangeElementType(operand->shape(), xla::U32);
+          const Shape pred_shape =
+              ShapeUtil::ChangeElementType(operand->shape(), xla::PRED);
+          HloInstruction* iota = computation->AddInstruction(
+              HloInstruction::CreateIota(mask_shape, dim));
+
+          HloInstruction* broadcasted_effective_size =
+              computation->AddInstruction(HloInstruction::CreateBroadcast(
+                  mask_shape, dynamic_size, {}));
+          HloInstruction* pred = computation->AddInstruction(
+              HloInstruction::CreateBinary(pred_shape, HloOpcode::kLt, iota,
+                                           broadcasted_effective_size));
+
+          HloInstruction* broadcasted_identity_value =
+              computation->AddInstruction(HloInstruction::CreateBroadcast(
+                  operand->shape(), identity_value, {}));
+          HloInstruction* padded =
+              computation->AddInstruction(HloInstruction::CreateTernary(
+                  operand->shape(), HloOpcode::kSelect, pred, operand,
+                  broadcasted_identity_value));
+          TF_RETURN_IF_ERROR(inst->ReplaceOperandWith(operand_num, padded));
+          operand = inst->mutable_operand(operand_num);
+          changed = true;
+        }
+      }
+    }
+  }
+  HloDCE dce;
+  TF_ASSIGN_OR_RETURN(changed, dce.Run(module));
+  VLOG(2) << "Post DynamicPadder HLO:";
+  XLA_VLOG_LINES(2, module->ToString());
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dynamic_padder.h b/tensorflow/compiler/xla/service/dynamic_padder.h
new file mode 100644
index 0000000000000000000000000000000000000000..509269f7f56746fa5516ad917a04221587c6dcca
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_padder.h
@@ -0,0 +1,44 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PADDER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PADDER_H_
+
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// With bounded shapes, only part of the shape contains effective data and the
+// rest contains padded data, whose value can be anything depending on the
+// source of the data. When a bounded shape is directly consumed by an
+// instruction that collapses dimensions (reduce for example), the padding data
+// would affect result of the instruction.
+//
+// DynamicPadder uses DynamicDimensionInference to detect bounded shapes in a
+// hlo module, it then inserts certain instructions to reset the padding into an
+// identity value so that in doesn't affect the result of subsequent
+// instruction. For example, it'd reset the padding to 0 before a bounded shape
+// is consumed by a reduce-sum.
+class DynamicPadder : public HloModulePass {
+ public:
+  absl::string_view name() const override { return "dynamic_padder"; }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PADDER_H_
diff --git a/tensorflow/compiler/xla/service/dynamic_padder_test.cc b/tensorflow/compiler/xla/service/dynamic_padder_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..55a11286e4596d87c330315322cae704fc5cd707
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_padder_test.cc
@@ -0,0 +1,152 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_padder.h"
+
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_runner.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace op = xla::testing::opcode_matchers;
+
+namespace xla {
+namespace {
+
+class DynamicPadderTest : public HloTestBase {
+ protected:
+  DynamicPadderTest() : HloTestBase() { module_ = CreateNewVerifiedModule(); }
+
+  StatusOr<bool> RunPadder() {
+    hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before padder");
+
+    DynamicPadder padder;
+
+    return padder.Run(module_.get());
+  }
+
+  void ExpectPadded(const HloInstruction* inst) {
+    EXPECT_THAT(inst,
+                op::Select(op::Lt(op::Iota(), op::Broadcast(op::Parameter())),
+                           ::testing::_, op::Broadcast()));
+  }
+
+  HloComputation* GetScalarAddComputation() {
+    auto embedded_builder = HloComputation::Builder("add");
+    auto lhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        0, ShapeUtil::MakeShape(F32, {}), "lhs"));
+    auto rhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        1, ShapeUtil::MakeShape(F32, {}), "rhs"));
+    embedded_builder.AddInstruction(
+        HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs));
+    return module_->AddEmbeddedComputation(embedded_builder.Build());
+  }
+
+  std::unique_ptr<HloModule> module_;
+  const Shape scalar_shape_ = ShapeUtil::MakeShape(U32, {});
+};
+
+TEST_F(DynamicPadderTest, ReduceTest) {
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+  auto reduce_shape = ShapeUtil::MakeShape(F32, {2});
+
+  auto data_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape, "data_param"));
+  builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "size_param"));
+
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(input_shape, HloOpcode::kNegate, data_param));
+
+  auto init = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+
+  auto reduce = builder.AddInstruction(HloInstruction::CreateReduce(
+      reduce_shape, negate, init, {0, 2}, GetScalarAddComputation()));
+
+  module_->AddEntryComputation(builder.Build());
+
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_ASSERT_OK(RunPadder().status());
+
+  ExpectPadded(reduce->operand(0));
+}
+
+TEST_F(DynamicPadderTest, ConvolutionTest) {
+  auto builder = HloComputation::Builder(TestName());
+  constexpr int xdim = 3;
+  constexpr int ydim = 2;
+  constexpr int zdim = 1;
+  auto xy_shape = ShapeUtil::MakeShape(F32, {xdim, ydim});
+  auto yz_shape = ShapeUtil::MakeShape(F32, {ydim, zdim});
+  auto zx_shape = ShapeUtil::MakeShape(F32, {zdim, xdim});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, xy_shape, "A"));
+  auto* b_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, yz_shape, "B"));
+  builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/2, scalar_shape_, "size_param"));
+
+  auto dnums = XlaBuilder::CreateDefaultConvDimensionNumbers(0);
+
+  dnums.set_kernel_input_feature_dimension(0);
+  dnums.set_kernel_output_feature_dimension(1);
+  dnums.set_input_batch_dimension(0);
+  dnums.set_output_batch_dimension(1);
+  dnums.set_output_feature_dimension(0);
+
+  Window window;
+
+  auto* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
+      zx_shape, a_param, b_param, /*feature_group_count=*/1,
+      /*batch_group_count=*/1, window, dnums,
+      HloTestBase::DefaultPrecisionConfig(2)));
+
+  module_->AddEntryComputation(builder.Build());
+
+  // Set up dynamic parameter binding for non-contracting dimension.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  // Set up binding for contracting dimensions.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_ASSERT_OK(RunPadder().status());
+
+  ExpectPadded(conv->operand(0));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index 6f928fcbaab53d804d580ba31f1b7fad0ed0bdc5..f84c115e0abec378f0401a15e6bd381983d0ff34 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -1758,9 +1758,18 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicSlice(
     auto index_typed_const = [&](uint64 c) -> llvm::Constant* {
       return llvm::ConstantInt::get(index_type, c);
     };
-    llvm_ir::IrArray::Index dim_index(1, index_typed_const(i));
-    TF_ASSIGN_OR_RETURN(llvm::Value * start_index_value,
-                        operand_to_generator.at(hlo->operand(1))(dim_index));
+    // TODO(b/118437727): Remove the R1 path.
+    llvm::Value* start_index_value;
+    if (hlo->operand(1)->shape().rank() == 1) {
+      llvm_ir::IrArray::Index dim_index(1, index_typed_const(i));
+      TF_ASSIGN_OR_RETURN(start_index_value,
+                          operand_to_generator.at(hlo->operand(1))(dim_index));
+    } else {
+      llvm_ir::IrArray::Index zero_index(index_type);
+      TF_ASSIGN_OR_RETURN(
+          start_index_value,
+          operand_to_generator.at(hlo->operand(1 + i))(zero_index));
+    }
 
     // Clamp the start index so that the sliced portion fits in the operand:
     // start_index = clamp(start_index, 0, operand_dim_size - output_dim_size)
@@ -1905,9 +1914,19 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
     auto index_typed_const = [&](uint64 c) -> llvm::Constant* {
       return llvm::ConstantInt::get(index_type, c);
     };
-    llvm_ir::IrArray::Index dim_index(1, index_typed_const(i));
-    TF_ASSIGN_OR_RETURN(llvm::Value * start_index_value,
-                        operand_to_generator.at(start_hlo)(dim_index));
+
+    llvm::Value* start_index_value;
+    // TODO(b/118437727): Remove the R1 path.
+    if (hlo->operand(2)->shape().rank() == 1) {
+      llvm_ir::IrArray::Index dim_index(1, index_typed_const(i));
+      TF_ASSIGN_OR_RETURN(start_index_value,
+                          operand_to_generator.at(hlo->operand(2))(dim_index));
+    } else {
+      llvm_ir::IrArray::Index zero_index(index_type);
+      TF_ASSIGN_OR_RETURN(
+          start_index_value,
+          operand_to_generator.at(hlo->operand(2 + i))(zero_index));
+    }
 
     // Clamp the start index so that the update region fits in the operand.
     // start_index = clamp(start_index, 0, input_dim_size - update_dim_size)
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 6c23f921f40cac0dc5df08494dc1b63e6d1d5e93..2be5d918eeabf4db812425ccc2a9c4ba067ddaac 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -3,6 +3,11 @@
 
 load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
 load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "tf_cuda_tests_tags",
+)
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
 licenses(["notice"])  # Apache 2.0
 
@@ -24,12 +29,6 @@ filegroup(
     ]),
 )
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load(
-    "//tensorflow/core:platform/default/build_config_root.bzl",
-    "tf_cuda_tests_tags",
-)
-
 xla_proto_library(
     name = "backend_configs",
     srcs = ["backend_configs.proto"],
@@ -94,8 +93,8 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_reachability",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -135,6 +134,8 @@ cc_library(
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/compiler/xla/service/llvm_ir:tuple_ops",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
         "@llvm//:core",
@@ -263,7 +264,9 @@ cc_library(
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:span",
     ],
@@ -362,6 +365,7 @@ cc_library(
         "//tensorflow/core/platform/default/build_config:stream_executor_cuda",  # build_cleaner: keep
         "//tensorflow/stream_executor",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -695,6 +699,8 @@ cc_library(
         "//tensorflow/compiler/xla/service:call_inliner",
         "//tensorflow/compiler/xla/service:conditional_simplifier",
         "//tensorflow/compiler/xla/service:convolution_group_converter",
+        "//tensorflow/compiler/xla/service:dot_decomposer",
+        "//tensorflow/compiler/xla/service:dynamic_index_splitter",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
         "//tensorflow/compiler/xla/service:hlo",
@@ -725,6 +731,7 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:regexp_internal",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor/cuda:cuda_diagnostics",
         "@com_google_absl//absl/container:node_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
index 528209abc75777440163c2e1512658b8ad36315b..eb59ee5a1d47b6b706ef3f53a76069b3538eb6b7 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -57,16 +58,16 @@ StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
 
     // If buffer #i's address is already registered (e.g. external arguments or
     // result buffers), use that registered buffer.
-    if (registered_buffers_.count(i)) {
-      se::DeviceMemoryBase address = FindOrDie(registered_buffers_, i);
-      if (reinterpret_cast<uintptr_t>(address.opaque()) % expected_alignment !=
+    if (se::DeviceMemoryBase* address =
+            tensorflow::gtl::FindOrNull(registered_buffers_, i)) {
+      if (reinterpret_cast<uintptr_t>(address->opaque()) % expected_alignment !=
           0) {
         return InternalError(
             "Address of registered buffer %d must be a multiple of %x, but "
             "was %p",
-            i, kEntryParameterAlignBytes, address.opaque());
+            i, kEntryParameterAlignBytes, address->opaque());
       }
-      buffer_allocations->SetBuffer(i, FindOrDie(registered_buffers_, i));
+      buffer_allocations->SetBuffer(i, *address);
       continue;
     }
 
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
index 14186b8faa68ad8492ea4863fcd7bd746e2eae48..9413ac2cff7c8d3ec4be6662569c580060bf1173 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <set>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -52,7 +53,8 @@ class BufferAllocations {
         DeviceMemoryAllocator* memory_allocator);
 
    private:
-    std::map<BufferAllocation::Index, se::DeviceMemoryBase> registered_buffers_;
+    absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase>
+        registered_buffers_;
   };
 
   ~BufferAllocations();
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
index 6d6780fa1c7b0c636eb771c40e74f074cd8c4c4b..309b0aca64954e64509d731dce28ce9d8da4ee43 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
@@ -146,7 +146,8 @@ tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) {
 // trouble, but we may want to revisit this if we ever find a model where
 // caching would speed up compilation a lot.
 StatusOr<CudnnConvAlgorithmPicker::AutotuneResult>
-CudnnConvAlgorithmPicker::PickBestAlgorithm(HloCustomCallInstruction* instr) {
+CudnnConvAlgorithmPicker::PickBestAlgorithm(
+    const HloCustomCallInstruction* instr) {
   // TODO(timshen): for now only check fp16. It can be expanded to other types,
   // with some work on the HLO routines.
   const bool cross_check_enabled =
@@ -249,12 +250,13 @@ CudnnConvAlgorithmPicker::PickBestAlgorithm(HloCustomCallInstruction* instr) {
     VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
             << instr->ToString();
 
-    backend_config.set_algorithm(alg.algo_id());
-    backend_config.set_tensor_ops_enabled(alg.tensor_ops_enabled());
-    TF_RETURN_IF_ERROR(instr->set_backend_config(backend_config));
+    // Use assignment instead of brace-list to make GCC 4.9 happy.
+    RunConvOptions options;
+    options.profile_result = &profile_result;
+    options.algo_override = alg;
     bool launch_ok =
         RunCudnnConv(instr, absl::MakeSpan(operand_buffers), result_buffer,
-                     &scratch_allocator, &stream, &profile_result)
+                     &scratch_allocator, &stream, options)
             .ok();
 
     if (launch_ok && profile_result.is_valid()) {
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
index 642af787afc71586d722ecc7e529ed8b3fa64d33..4991db0948589e479a202f4082d96df275f6e088 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
@@ -56,7 +56,8 @@ class CudnnConvAlgorithmPicker : public HloModulePass {
 
   StatusOr<bool> RunOnComputation(HloComputation* computation);
   StatusOr<bool> RunOnInstruction(HloInstruction* instr);
-  StatusOr<AutotuneResult> PickBestAlgorithm(HloCustomCallInstruction* instr);
+  StatusOr<AutotuneResult> PickBestAlgorithm(
+      const HloCustomCallInstruction* instr);
 
   se::StreamExecutor* stream_exec_;                   // never null
   DeviceMemoryAllocator* allocator_;                  // may be null
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc
index 3425e1b4942aaf1011ba1bf1c50dd7e79c1f9807..b628f27f4b2ba8ccf17fd531d8a0c25cb99d9396 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc
@@ -395,32 +395,36 @@ Status RunCudnnConv(const HloCustomCallInstruction* conv,
                     absl::Span<se::DeviceMemoryBase> operand_buffers,
                     se::DeviceMemoryBase result_buffer,
                     se::DeviceMemoryBase scratch_buf, se::Stream* stream,
-                    se::dnn::ProfileResult* profile_result) {
+                    RunConvOptions options) {
   ScratchBufAllocator scratch_allocator(scratch_buf);
   return RunCudnnConv(conv, operand_buffers, result_buffer, &scratch_allocator,
-                      stream, profile_result);
+                      stream, options);
 }
 
 Status RunCudnnConv(const HloCustomCallInstruction* conv,
                     absl::Span<se::DeviceMemoryBase> operand_buffers,
                     se::DeviceMemoryBase result_buffer,
                     se::ScratchAllocator* scratch_allocator, se::Stream* stream,
-                    se::dnn::ProfileResult* profile_result) {
+                    RunConvOptions options) {
   TF_ASSIGN_OR_RETURN(CudnnConvParams params,
                       GetCudnnConvParams(conv, operand_buffers, result_buffer));
 
+  if (options.algo_override) {
+    params.algorithm = AlgorithmConfig(*options.algo_override);
+  }
+
   PrimitiveType output_primitive_type =
       conv->shape().tuple_shapes(0).element_type();
   switch (output_primitive_type) {
     case F16:
       return RunCudnnConvImpl<Eigen::half>(params, scratch_allocator, stream,
-                                           profile_result);
+                                           options.profile_result);
     case F32:
       return RunCudnnConvImpl<float>(params, scratch_allocator, stream,
-                                     profile_result);
+                                     options.profile_result);
     case F64:
       return RunCudnnConvImpl<double>(params, scratch_allocator, stream,
-                                      profile_result);
+                                      options.profile_result);
     default:
       LOG(FATAL) << ShapeUtil::HumanString(*params.output_shape);
   }
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h
index edbc75a94a1238540390b93f0fa5217852c7781f..25b2461ca61251c6cb7b89b1f91da0f1636a3647 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h
@@ -28,6 +28,14 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
+struct RunConvOptions {
+  // Nullable output-parameter pointer for profiling results.
+  se::dnn::ProfileResult* profile_result = nullptr;
+
+  // Use this algorithm, instead of the one from the instruction.
+  absl::optional<se::dnn::AlgorithmDesc> algo_override;
+};
+
 // This file contains low-level routines for running cudnn convolutions.
 
 // Calls into cudnn to run the specified convolution.
@@ -46,13 +54,13 @@ Status RunCudnnConv(const HloCustomCallInstruction* conv,
                     absl::Span<se::DeviceMemoryBase> operand_buffers,
                     se::DeviceMemoryBase result_buffer,
                     se::DeviceMemoryBase scratch_buf, se::Stream* stream,
-                    se::dnn::ProfileResult* profile_result = nullptr);
+                    RunConvOptions = {});
 
 Status RunCudnnConv(const HloCustomCallInstruction* conv,
                     absl::Span<se::DeviceMemoryBase> operand_buffers,
                     se::DeviceMemoryBase result_buffer,
                     se::ScratchAllocator* scratch_allocator, se::Stream* stream,
-                    se::dnn::ProfileResult* profile_result = nullptr);
+                    RunConvOptions = {});
 
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
index 9634c92786ad4dd0d62bae9cbb087b4ef6b5866f..69aaaceca112364a4fd562f6a5eff1629fd3fc54 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_cat.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Function.h"
@@ -45,10 +46,10 @@ void HloToIrBindings::EmitBasePointersForHlos(
 
   // An HLO can have duplicated operands. This data structure remembers which
   // operand HLOs are already bound to avoid rebinding the same HLO.
-  std::set<const HloInstruction*> already_bound_for_this_function;
+  absl::flat_hash_set<const HloInstruction*> already_bound_for_this_function;
   auto arg_iter = function->arg_begin();
   for (const HloInstruction* io_hlo : io_hlos) {
-    if (!already_bound_for_this_function.count(io_hlo)) {
+    if (!already_bound_for_this_function.contains(io_hlo)) {
       if (!is_nested_ && io_hlo->opcode() == HloOpcode::kGetTupleElement) {
         BindHloToIrValue(*io_hlo, EmitGetTupleElement(io_hlo, &*arg_iter));
       } else {
@@ -63,7 +64,7 @@ void HloToIrBindings::EmitBasePointersForHlos(
   temp_buffer_base_->setName("temp_buffer");
 
   for (const HloInstruction* non_io_hlo : non_io_hlos) {
-    if (already_bound_for_this_function.count(non_io_hlo)) {
+    if (already_bound_for_this_function.contains(non_io_hlo)) {
       continue;
     }
     already_bound_for_this_function.insert(non_io_hlo);
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
index c0edae530cedba45c897b07b7b9cc72eaaab397c..f57b594e9c18078a3bbbf4d2b4db7e989c4edfdd 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <unordered_map>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/types/span.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Value.h"
@@ -61,7 +62,7 @@ class HloToIrBindings {
 
   // Returns whether `hlo` is bound to an LLVM IR value.
   bool BoundToIrValue(const HloInstruction& hlo) const {
-    return base_ptrs_.count(&hlo);
+    return base_ptrs_.contains(&hlo);
   }
 
   llvm::Value* GetTempBufferBase() const { return temp_buffer_base_; }
@@ -110,7 +111,8 @@ class HloToIrBindings {
   // For an instruction that generates multiple outputs, the root will be a
   // tuple shape. The IrArray for each element output is stored in the subnode
   // in the ShapeTree.
-  std::unordered_map<const HloInstruction*, ShapeTree<llvm::Value*>> base_ptrs_;
+  absl::flat_hash_map<const HloInstruction*, ShapeTree<llvm::Value*>>
+      base_ptrs_;
 
   // The address of the memory block that contains all temporary buffers.
   llvm::Value* temp_buffer_base_ = nullptr;
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index 5d25a032a99b2e477ba81e9d8ea31b2e1ddf93ec..caccb1889977de4601889d4cc39ac74418ac6652 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -154,20 +154,17 @@ bool IsReductionToVector(const HloInstruction& reduce) {
   const HloInstruction* input = reduce.operand(0);
   std::vector<int64> dims_to_keep;
   for (int64 dim = 0; dim < input->shape().dimensions().size(); ++dim) {
-    if (!std::count(reduce.dimensions().begin(), reduce.dimensions().end(),
-                    dim)) {
+    if (!absl::c_linear_search(reduce.dimensions(), dim)) {
       dims_to_keep.push_back(dim);
     }
   }
   return LayoutUtil::AreDimensionsConsecutive(input->shape().layout(),
                                               dims_to_keep) &&
-         ShapeUtil::Equal(reduce.shape(), ShapeUtil::FilterDimensions(
-                                              [&dims_to_keep](int64 dim) {
-                                                return std::count(
-                                                    dims_to_keep.begin(),
-                                                    dims_to_keep.end(), dim);
-                                              },
-                                              input->shape()));
+         ShapeUtil::Equal(
+             reduce.shape(),
+             ShapeUtil::FilterDimensions(
+                 [&](int64 dim) { return absl::c_count(dims_to_keep, dim); },
+                 input->shape()));
 }
 
 // This emits a device-side call to
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 063d493d90a2330c535ed4a16d063d6db9b49cd1..34ddeb1d417729e467cebba4986a763dc685c05b 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -88,6 +88,9 @@ namespace xla {
 namespace gpu {
 
 using llvm_ir::KernelMappingScheme;
+using EmitElementFunction =
+    std::function<void(const llvm_ir::IrArray::Index& index, llvm::Value* y_loc,
+                       llvm::Value* x_loc, int64 x_iter_num)>;
 
 namespace {
 
@@ -1506,10 +1509,10 @@ std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
                     return !allocation->is_constant();
                   });
 
-  std::sort(non_constant_buffers.begin(), non_constant_buffers.end(),
-            [](const BufferAllocation* a, const BufferAllocation* b) {
-              return a->index() < b->index();
-            });
+  absl::c_sort(non_constant_buffers,
+               [](const BufferAllocation* a, const BufferAllocation* b) {
+                 return a->index() < b->index();
+               });
 
   llvm::Function* kernel = BuildKernelPrototype(*inst, non_constant_buffers);
 
@@ -2133,53 +2136,86 @@ int IrEmitterUnnested::ConstructInputReducedShapeAndCastInputIrArrayToShape(
 
 namespace {
 
-void EmitFullElementalTile(
-    const KernelMappingScheme* mapping_scheme,
-    const IrArray::Index& tile_origin_index, const string& loop_name,
-    KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y,
-    llvm::Value* x, llvm::Type* index_ty,
-    const std::function<void(const IrArray::Index&, llvm::Value*,
-                             llvm::Value*)>& emit_elem_function) {
+std::tuple<llvm::Value*, int64> GetStartOffsetAndStepForX(
+    int64 tile_size_x, int64 num_threads_x,
+    const KernelMappingScheme* mapping_scheme, llvm::IRBuilder<>* builder,
+    llvm::Value* x, llvm::Type* index_ty) {
+  llvm::Value* start_offset_x;
+  int64 step_x;
+  if (mapping_scheme->DilatedX()) {
+    start_offset_x = x;
+    step_x = num_threads_x;
+  } else {
+    start_offset_x = builder->CreateMul(
+        x, llvm::ConstantInt::get(index_ty, tile_size_x / num_threads_x));
+    step_x = 1;
+  }
+  return std::make_tuple(start_offset_x, step_x);
+}
+
+void EmitFullElementalTile(const KernelMappingScheme* mapping_scheme,
+                           const IrArray::Index& tile_origin_index,
+                           const string& loop_name, KernelSupportLibrary* ksl,
+                           llvm::IRBuilder<>* builder, llvm::Value* y,
+                           llvm::Value* x, llvm::Type* index_ty,
+                           const EmitElementFunction& emit_elem_function) {
   int64 num_threads_x = mapping_scheme->GetNumberOfThreadsForDimensionX();
   int64 num_threads_y = mapping_scheme->GetNumberOfThreadsForDimensionY();
   int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX();
   int64 tile_size_y = mapping_scheme->GetTileSizeForDimensionY();
+
+  llvm::Value* start_offset_x;
+  int64 step_x;
+  std::tie(start_offset_x, step_x) = GetStartOffsetAndStepForX(
+      tile_size_x, num_threads_x, mapping_scheme, builder, x, index_ty);
+  IrArray::Index source_idx =
+      tile_origin_index.AddOffsetToDim(y, KernelMappingScheme::DimY, builder)
+          .AddOffsetToDim(start_offset_x, KernelMappingScheme::DimX, builder);
   ksl->For(loop_name + "_y", /*start=*/llvm::ConstantInt::get(index_ty, 0),
            /*end=*/llvm::ConstantInt::get(index_ty, tile_size_y),
            /*step=*/llvm::ConstantInt::get(index_ty, num_threads_y),
            [&](llvm::Value* y_indvar) {
-             IrArray::Index source_idx_y = tile_origin_index.AddOffsetToDim(
+             IrArray::Index source_idx_y = source_idx.AddOffsetToDim(
                  y_indvar, KernelMappingScheme::DimY, builder);
              llvm::Value* y_loc = builder->CreateAdd(y_indvar, y);
-             for (int64 j = 0; j < tile_size_x; j += num_threads_x) {
-               IrArray::Index source_idx = source_idx_y.AddOffsetToDim(
-                   llvm::ConstantInt::get(index_ty, j),
+
+             for (int64 j = 0; j < tile_size_x / num_threads_x; j++) {
+               IrArray::Index source_idx_y_x = source_idx_y.AddOffsetToDim(
+                   llvm::ConstantInt::get(index_ty, j * step_x),
                    KernelMappingScheme::DimX, builder);
-               llvm::Value* x_loc =
-                   builder->CreateAdd(llvm::ConstantInt::get(index_ty, j), x);
-               emit_elem_function(source_idx, y_loc, x_loc);
+               llvm::Value* x_loc = builder->CreateAdd(
+                   llvm::ConstantInt::get(index_ty, j * step_x),
+                   start_offset_x);
+               emit_elem_function(source_idx_y_x, y_loc, x_loc, j);
              }
            });
 }
 
-void EmitPartialElementalTile(
-    const KernelMappingScheme* mapping_scheme,
-    const IrArray::Index& tile_origin_index, const string& loop_name,
-    KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y,
-    llvm::Value* x, llvm::Value* tile_height, llvm::Value* tile_width,
-    llvm::Type* index_ty,
-    const std::function<void(const IrArray::Index&, llvm::Value*,
-                             llvm::Value*)>& emit_elem_function) {
+void EmitPartialElementalTile(const KernelMappingScheme* mapping_scheme,
+                              const IrArray::Index& tile_origin_index,
+                              const string& loop_name,
+                              KernelSupportLibrary* ksl,
+                              llvm::IRBuilder<>* builder, llvm::Value* y,
+                              llvm::Value* x, llvm::Value* tile_height,
+                              llvm::Value* tile_width, llvm::Type* index_ty,
+                              const EmitElementFunction& emit_elem_function) {
   int64 num_threads_x = mapping_scheme->GetNumberOfThreadsForDimensionX();
   int64 num_threads_y = mapping_scheme->GetNumberOfThreadsForDimensionY();
   int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX();
 
-  for (int64 j = 0; j < tile_size_x; j += num_threads_x) {
-    IrArray::Index source_idx =
-        tile_origin_index.AddOffsetToDim(llvm::ConstantInt::get(index_ty, j),
-                                         KernelMappingScheme::DimX, builder);
-    llvm::Value* x_loc =
-        builder->CreateAdd(llvm::ConstantInt::get(index_ty, j), x);
+  llvm::Value* start_offset_x;
+  int64 step_x;
+  std::tie(start_offset_x, step_x) = GetStartOffsetAndStepForX(
+      tile_size_x, num_threads_x, mapping_scheme, builder, x, index_ty);
+  IrArray::Index source_idx =
+      tile_origin_index.AddOffsetToDim(y, KernelMappingScheme::DimY, builder)
+          .AddOffsetToDim(start_offset_x, KernelMappingScheme::DimX, builder);
+  for (int64 j = 0; j < tile_size_x / num_threads_x; j++) {
+    IrArray::Index source_idx_x =
+        source_idx.AddOffsetToDim(llvm::ConstantInt::get(index_ty, j * step_x),
+                                  KernelMappingScheme::DimX, builder);
+    llvm::Value* x_loc = builder->CreateAdd(
+        llvm::ConstantInt::get(index_ty, j * step_x), start_offset_x);
 
     ksl->If(
         loop_name + "_x_in_tile", builder->CreateICmpULT(x_loc, tile_width),
@@ -2199,14 +2235,13 @@ void EmitPartialElementalTile(
               /*step=*/llvm::ConstantInt::get(index_ty, num_threads_y),
               [&](llvm::Value* y_indvar) {
                 llvm::Value* y_loc = builder->CreateAdd(y_indvar, y);
-                ksl->If(
-                    loop_name + "_y_in_tile",
-                    builder->CreateICmpULT(y_loc, tile_height), [&] {
-                      emit_elem_function(
-                          source_idx.AddOffsetToDim(
-                              y_indvar, KernelMappingScheme::DimY, builder),
-                          y_loc, x_loc);
-                    });
+                ksl->If(loop_name + "_y_in_tile",
+                        builder->CreateICmpULT(y_loc, tile_height), [&] {
+                          emit_elem_function(
+                              source_idx_x.AddOffsetToDim(
+                                  y_indvar, KernelMappingScheme::DimY, builder),
+                              y_loc, x_loc, j);
+                        });
               });
         });
   }
@@ -2225,8 +2260,7 @@ void EmitTiledElementalCodeWithBoundsCheck(
     const IrArray::Index& tile_origin_index, const string& loop_name,
     KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y,
     llvm::Value* x, llvm::Value* tile_height, llvm::Value* tile_width,
-    const std::function<void(const IrArray::Index&, llvm::Value*,
-                             llvm::Value*)>& emit_elem_function) {
+    const EmitElementFunction& emit_elem_function) {
   int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX();
   int64 tile_size_y = mapping_scheme->GetTileSizeForDimensionY();
   llvm::Type* index_ty = tile_width->getType();
@@ -2262,7 +2296,7 @@ void EmitTiledElementalCodeWithBoundsCheck(
 void IrEmitterUnnested::EmitTileElementForCopy(
     HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
     const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
-    llvm::Value* x_loc) {
+    llvm::Value* x_loc, int64 /*x_iter_num*/) {
   llvm_ir::TiledParameterInfo* tiled_param_info =
       kernel_info->GetTiledParameterInfo();
   // TODO(jlebar): Add AA metadata to this load.
@@ -2292,7 +2326,7 @@ void IrEmitterUnnested::EmitTileElementForCopy(
 void IrEmitterUnnested::EmitTileElementForFusion(
     HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
     const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
-    llvm::Value* x_loc) {
+    llvm::Value* x_loc, int64 /*x_iter_num*/) {
   llvm_ir::TiledParameterInfo* tiled_param_info =
       kernel_info->GetTiledParameterInfo();
   std::vector<IrArray> output_arrays = ConstructIrArrayForOutputs(*hlo);
@@ -2393,6 +2427,23 @@ class ReductionCodegenInfo : public IrEmitterUnnested::KernelCodegenInfo {
                             : llvm_ir::KernelMappingScheme::DimX;
   }
 
+  int GetNumberOfPartialResults() const {
+    if (IsRowReduction()) {
+      return 1;
+    }
+    int64 num_thread = mapping_scheme_->GetNumberOfThreadsForDimensionX();
+    int64 tile_size = mapping_scheme_->GetTileSizeForDimensionX();
+    CHECK_EQ(tile_size % num_thread, 0);
+    return tile_size / num_thread;
+  }
+
+  int GetPartialResultIndex(int64 x_iter_num) const {
+    if (IsRowReduction()) {
+      return 0;
+    }
+    return x_iter_num;
+  }
+
  private:
   AddressVector partial_result_addresses_;
   AddressVector reduction_input_addresses_;
@@ -2452,10 +2503,11 @@ void IrEmitterUnnested::EmitPrologueForOneReduction(
   llvm::AllocaInst* reduction_input_address = Alloca(element_type);
   reduction_input_addresses->push_back(reduction_input_address);
 
+  int num_partial_results = reduction_info->GetNumberOfPartialResults();
   AddressVector* partial_result_addresses =
       reduction_info->GetMutablePartialResultAddresses();
   llvm::AllocaInst* partial_result_address =
-      Alloca(element_type, /*ArraySize=*/nullptr,
+      Alloca(element_type, /*ArraySize=*/b_.getInt32(num_partial_results),
              "partial_reduction_result." + llvm::Twine(reduce_idx));
   partial_result_addresses->push_back(partial_result_address);
 
@@ -2478,7 +2530,9 @@ void IrEmitterUnnested::EmitPrologueForOneReduction(
             .EmitReadArrayElement(IrArray::Index(b_.getInt32Ty()), &b_);
   }
 
-  Store(init_ir_value, partial_result_address);
+  for (int i = 0; i < num_partial_results; ++i) {
+    Store(init_ir_value, InBoundsGEP(partial_result_address, {b_.getInt32(i)}));
+  }
 }
 
 void IrEmitterUnnested::EmitPrologueForReduction(
@@ -2516,10 +2570,14 @@ void IrEmitterUnnested::EmitPrologueForReduction(
                                 std::move(output_shape_index));
   }
 
-  // Allocate stack storage to store the current output linear index and record
-  // the address of the storage.
+  int num_partial_results = reduction_info->GetNumberOfPartialResults();
+
+  // Allocate stack storage to store the linear indices for the current output,
+  // and record the address of the storage.
   reduction_info->SetCurrentOutputLinearIndexAddress(
-      Alloca(reduction_info->GetIndexType()));
+      Alloca(reduction_info->GetIndexType(),
+             /*ArraySize=*/b_.getInt32(num_partial_results),
+             "current_output_linear_index_address"));
 
   if (!reduction_info->IsRowReduction()) {
     llvm::Type* bool_ty = b_.getInt1Ty();
@@ -2589,36 +2647,45 @@ void IrEmitterUnnested::EmitEpilogueForReduction(
     llvm_ir::SetToFirstInsertPoint(if_output_inbound_data.true_block, &b_);
   }
 
+  int num_partial_results = reduction_info->GetNumberOfPartialResults();
+
   // Emit an atomic operation that accumulates the partial reduction to the
   // output element. For row reduction, this is only for lane 0 due to the
   // if-statement emitted above.
   for (int i = 0; i != num_reduces; ++i) {
-    IrArray::Index element_index(
-        /*linear=*/Load(reduction_info->GetCurrentOutputLinearIndexAddress(),
-                        "output_linear_addr"),
-        ShapeUtil::GetSubshape(unnested_hlo->shape(),
-                               reduction_output_shape_indices[i]),
-        &b_);
-    llvm::Value* output_address =
-        GetIrArray(*unnested_hlo, *unnested_hlo,
-                   reduction_output_shape_indices[i])
-            .EmitArrayElementAddress(element_index, &b_,
-                                     "output_element_address");
-    // Do not emit atomic operations if each element in the reduction result is
-    // computed by one block, that is the dimension being reduced has only one
-    // block.
-    const llvm_ir::KernelMappingScheme* mapping_scheme =
-        reduction_info->GetKernelMappingScheme();
-    if (mapping_scheme->GetTileBlockSizeForDimension(
-            llvm_ir::KernelMappingScheme::DimZ) == 1 &&
-        mapping_scheme->GetTileBlockSizeForDimension(
-            reduction_info->GetReducedDimensionEnum()) == 1) {
-      TF_CHECK_OK(EmitCallToNestedComputation(
-          *reducers[i], {output_address, partial_result_addresses[i]},
-          output_address));
-    } else {
-      TF_CHECK_OK(EmitAtomicOperationForNestedComputation(
-          *reducers[i], output_address, partial_result_addresses[i]));
+    for (int j = 0; j < num_partial_results; ++j) {
+      IrArray::Index element_index(
+          /*linear=*/Load(
+              InBoundsGEP(reduction_info->GetCurrentOutputLinearIndexAddress(),
+                          {b_.getInt32(j)}),
+              "output_linear_addr"),
+          ShapeUtil::GetSubshape(unnested_hlo->shape(),
+                                 reduction_output_shape_indices[i]),
+          &b_);
+      llvm::Value* output_address =
+          GetIrArray(*unnested_hlo, *unnested_hlo,
+                     reduction_output_shape_indices[i])
+              .EmitArrayElementAddress(element_index, &b_,
+                                       "output_element_address");
+      // Do not emit atomic operations if each element in the reduction result
+      // is computed by one block, that is the dimension being reduced has only
+      // one block.
+      const llvm_ir::KernelMappingScheme* mapping_scheme =
+          reduction_info->GetKernelMappingScheme();
+      if (mapping_scheme->GetTileBlockSizeForDimension(
+              llvm_ir::KernelMappingScheme::DimZ) == 1 &&
+          mapping_scheme->GetTileBlockSizeForDimension(
+              reduction_info->GetReducedDimensionEnum()) == 1) {
+        TF_CHECK_OK(EmitCallToNestedComputation(
+            *reducers[i],
+            {output_address,
+             InBoundsGEP(partial_result_addresses[i], {b_.getInt32(j)})},
+            output_address));
+      } else {
+        TF_CHECK_OK(EmitAtomicOperationForNestedComputation(
+            *reducers[i], output_address,
+            InBoundsGEP(partial_result_addresses[i], {b_.getInt32(j)})));
+      }
     }
   }
 }
@@ -2626,7 +2693,7 @@ void IrEmitterUnnested::EmitEpilogueForReduction(
 void IrEmitterUnnested::EmitTileElementForReduction(
     HloInstruction* unnested_hlo, const llvm_ir::IrArray::Index& index,
     const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
-    llvm::Value* x_loc) {
+    llvm::Value* x_loc, int64 x_iter_num) {
   VLOG(10) << "Emit tile element for reduce " << unnested_hlo->ToString();
   HloInstruction* reduce_or_tuple = unnested_hlo->opcode() == HloOpcode::kFusion
                                         ? unnested_hlo->fused_expression_root()
@@ -2639,8 +2706,11 @@ void IrEmitterUnnested::EmitTileElementForReduction(
   // Record the linear address for the current reduction.
   const ReductionCodegenInfo* reduction_info =
       dynamic_cast<const ReductionCodegenInfo*>(kernel_info);
+  int partial_result_index = reduction_info->IsRowReduction() ? 0 : x_iter_num;
+
   Store(index[reduction_info->GetKeptDimensionEnum()],
-        reduction_info->GetCurrentOutputLinearIndexAddress());
+        InBoundsGEP(reduction_info->GetCurrentOutputLinearIndexAddress(),
+                    {b_.getInt32(partial_result_index)}));
   if (!reduction_info->IsRowReduction()) {
     llvm::Type* bool_ty = b_.getInt1Ty();
     llvm::AllocaInst* output_inbound_addr =
@@ -2687,6 +2757,13 @@ void IrEmitterUnnested::EmitTileElementForReduction(
       reduction_info->GetKernelMappingScheme()->GetUnnormalizedIndex(
           index,
           GetFirstReduceInstruction(output_instructions)->operand(0)->shape());
+  int num_partial_results = reduction_info->GetNumberOfPartialResults();
+  if (num_partial_results > 1) {
+    // Clear the linear index field of the IrArray::Index to enable the use of
+    // GetElementPointer with array types. This enables the vectorization of
+    // the computation for different partial results.
+    input_index.ClearLinearIndex();
+  }
   absl::Span<llvm::AllocaInst* const> partial_reduction_result_addresses =
       reduction_info->GetPartialResultAddresses();
   absl::Span<llvm::AllocaInst* const> reduction_input_addresses =
@@ -2699,10 +2776,12 @@ void IrEmitterUnnested::EmitTileElementForReduction(
   for (int i = 0; i != reducers.size(); ++i) {
     llvm::Value* const input_ir_value = input_gens[i](input_index).ValueOrDie();
     Store(input_ir_value, reduction_input_addresses[i]);
+    llvm::Value* partial_result_address =
+        InBoundsGEP(partial_reduction_result_addresses[i],
+                    {b_.getInt32(partial_result_index)});
     TF_CHECK_OK(EmitCallToNestedComputation(
-        *reducers[i],
-        {partial_reduction_result_addresses[i], reduction_input_addresses[i]},
-        partial_reduction_result_addresses[i]));
+        *reducers[i], {partial_result_address, reduction_input_addresses[i]},
+        partial_result_address));
   }
 
   // Emit code to generate the output for the non-reduction instructions in the
@@ -2713,8 +2792,8 @@ void IrEmitterUnnested::EmitTileElementForReduction(
 
 // Emits a kernel for the hlo instruction using the given tiling scheme.
 void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile,
-                                  const KernelCodegenInfo* kernel_info,
-                                  KernelSupportLibrary& ksl,
+                                  KernelCodegenInfo* kernel_info,
+                                  KernelSupportLibrary* ksl,
                                   llvm::Type* index_ty) {
   KernelMappingScheme* mapping_scheme = kernel_info->GetKernelMappingScheme();
   absl::Span<const int64> dims_in_tile = mapping_scheme->GetDimensionsInTiles();
@@ -2747,15 +2826,14 @@ void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile,
           llvm::Value* num_tiles_in_block =
               Select(ICmpEQ(last_block_for_dim, block_id_for_dim),
                      last_block_size_for_dim, block_size_for_dim);
-
-          ksl.For(loop_name,
-                  /*start=*/index_typed_constant(0),
-                  /*end=*/num_tiles_in_block,
-                  /*step=*/1, [&](llvm::Value* block_dim_induction_var) {
-                    IrArray::Index tile_index = starting_tile.AddOffsetToDim(
-                        block_dim_induction_var, dim_id, &b_);
-                    emit_next_block_dim(tile_index);
-                  });
+          ksl->For(loop_name,
+                   /*start=*/index_typed_constant(0),
+                   /*end=*/num_tiles_in_block,
+                   /*step=*/1, [&](llvm::Value* block_dim_induction_var) {
+                     IrArray::Index tile_index = starting_tile.AddOffsetToDim(
+                         block_dim_induction_var, dim_id, &b_);
+                     emit_next_block_dim(tile_index);
+                   });
         }
       };
 
@@ -2810,7 +2888,8 @@ void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile,
 // unnested_hlo: The unnested hlo instruction for which the kernel is generated.
 //   Currently, these hlo instructions are supported: kLoop fusion, kCopy.
 // tiled_param_ids: The IDs for the parameters that are 0-2-1 transpose of
-//   other tensors with the same dimensions and need to be tiled and tranposed.
+//   other tensors with the same dimensions and are safe to be tranposed via
+//   the shared memory tranpose implementation.
 // mapping_scheme: The tiling scheme to use.
 // kernel_generator: Contains function objects for code generation, such as
 //   element generator, block prologue and epilogue generators.
@@ -2898,8 +2977,7 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
   auto emit_tiled_elemental_code_with_bounds_check =
       [&](const IrArray::Index& index, const string& loop_name,
           llvm::Value* tile_height, llvm::Value* tile_width,
-          const std::function<void(const IrArray::Index&, llvm::Value*,
-                                   llvm::Value*)>& emit_elem_function) {
+          const EmitElementFunction& emit_elem_function) {
         EmitTiledElementalCodeWithBoundsCheck(mapping_scheme, index, loop_name,
                                               &ksl, &b_, y, x, tile_height,
                                               tile_width, emit_elem_function);
@@ -2912,10 +2990,6 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
     const IrArray::Index input_tile_origin(
         Permute({0, 2, 1}, output_tile_origin.multidim()));
 
-    const IrArray::Index input_index =
-        input_tile_origin.AddOffsetToDim(x, KernelMappingScheme::DimX, &b_)
-            .AddOffsetToDim(y, KernelMappingScheme::DimY, &b_);
-
     // If shared memory transpose is needed, wait for all threads to reach this
     // point, lest we copy a value from tile to output before the other thread
     // copies it from input to tile. This is `__syncthreads` in CUDA.
@@ -2925,9 +2999,10 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
       // Note that tile_width and tile_height are flipped here because we are
       // reading a transposed tile.
       emit_tiled_elemental_code_with_bounds_check(
-          input_index, "input", output_tile_bounds[2], output_tile_bounds[1],
+          input_tile_origin, "input", output_tile_bounds[2],
+          output_tile_bounds[1],
           [&](const IrArray::Index& index, llvm::Value* y_loc,
-              llvm::Value* x_loc) {
+              llvm::Value* x_loc, int64 /*x_iter_num*/) {
             for (int64 id : tiled_param_ids) {
               IrArray& input_in_logical_shape =
                   param_in_reduced_shape_arrays[id];
@@ -2947,18 +3022,15 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
     llvm_ir::TiledParameterInfo tiled_param_info(param_shmem_buffers, y, x);
     kernel_info->SetTiledParamInfo(&tiled_param_info);
 
-    const IrArray::Index output_index =
-        output_tile_origin.AddOffsetToDim(x, KernelMappingScheme::DimX, &b_)
-            .AddOffsetToDim(y, KernelMappingScheme::DimY, &b_);
-
     // Write to output[index] by emitting code like normal, except that values
     // for the tiled parameters are read from the shmem buffers.
     emit_tiled_elemental_code_with_bounds_check(
-        output_index, "output", output_tile_bounds[1], output_tile_bounds[2],
-        [&](const IrArray::Index& index, llvm::Value* y_loc,
-            llvm::Value* x_loc) {
-          kernel_generator.GetTileElementGenerator()(unnested_hlo, index,
-                                                     kernel_info, y_loc, x_loc);
+        output_tile_origin, "output", output_tile_bounds[1],
+        output_tile_bounds[2],
+        [&](const IrArray::Index& index, llvm::Value* y_loc, llvm::Value* x_loc,
+            int64 x_iter_num) {
+          kernel_generator.GetTileElementGenerator()(
+              unnested_hlo, index, kernel_info, y_loc, x_loc, x_iter_num);
         });
 
     // If a tile block contains multiple tiles and shared memory buffers are
@@ -2976,7 +3048,7 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
     block_prologue_generator(unnested_hlo, kernel_info);
   }
 
-  EmitBlock(std::move(emit_one_tile), kernel_info, ksl, index_ty);
+  EmitBlock(std::move(emit_one_tile), kernel_info, &ksl, index_ty);
 
   const BlockEpilogueGenerator& block_epilogue_generator =
       kernel_generator.GetBlockEpilogueGenerator();
@@ -2989,7 +3061,10 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
 
 // Emits a kernel for the given hlo instruction using a tiled 0-2-1 transpose
 // algorithm to improve the memory access patterns for the input parameters
-// with a shape that is a 0-2-1 transpose of the output tensor shape.
+// with a shape that is a 0-2-1 transpose of the output tensor shape. The caller
+// is responsible for making sure that it is safe to apply the shared memory
+// tranpose on the input parameters.
+//
 //
 // For the purpose of tiling, the output tensors have a logical shape of three
 // components 0-2-1 while the relevant input parameters have a logical shape
@@ -3022,17 +3097,19 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
     element_generator = [&](HloInstruction* hlo,
                             const llvm_ir::IrArray::Index& index,
                             const KernelCodegenInfo* kernel_info,
-                            llvm::Value* y_loc, llvm::Value* x_loc) {
-      EmitTileElementForCopy(hlo, index, kernel_info, y_loc, x_loc);
+                            llvm::Value* y_loc, llvm::Value* x_loc,
+                            int64 x_iter_num) {
+      EmitTileElementForCopy(hlo, index, kernel_info, y_loc, x_loc, x_iter_num);
     };
   } else {
     DCHECK_EQ(hlo->opcode(), HloOpcode::kFusion);
-    element_generator = [&](HloInstruction* hlo,
-                            const llvm_ir::IrArray::Index& index,
-                            const KernelCodegenInfo* kernel_info,
-                            llvm::Value* y_loc, llvm::Value* x_loc) {
-      EmitTileElementForFusion(hlo, index, kernel_info, y_loc, x_loc);
-    };
+    element_generator =
+        [&](HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
+            const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
+            llvm::Value* x_loc, int64 x_iter_num) {
+          EmitTileElementForFusion(hlo, index, kernel_info, y_loc, x_loc,
+                                   x_iter_num);
+        };
   }
   KernelCodegenInfo kernel_info(&mapping_scheme);
   KernelCodeGenerator kernel_generator(std::move(element_generator));
@@ -3040,26 +3117,99 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
 }
 
 namespace {
-// Returns true to indicate it is safe to use the tile based shared memory
-// transpose implementation to implement the kernel for the instruction.
+// A recursive function to inspect the users of a parameter to determine
+// whether it's safe for a parameter to participate in a shared-memory
+// transpose.
 //
-// An instruction is not safe for such an implementation if it can change the
-// element order of a tensor without changing the dimension of the tensor, and
-// the instruction has a corresponding elemental_ir_emitter.
-bool IsInstructionSafeForTileBasedTranspose(const HloInstruction* hlo) {
-  auto is_safe_for_tile_based_transpose = [&](const HloInstruction* instr) {
-    HloOpcode opcode = instr->opcode();
-    CHECK_NE(opcode, HloOpcode::kFusion);
-    return (opcode != HloOpcode::kReverse && opcode != HloOpcode::kGather);
-  };
+// Consider a fusion parameter P for which we might want to use a shmem
+// transpose.  If we do, we use a GPU thread block to preload a tile of P with
+// indices [z, y..y+31, x..x+31] to compute an output tile with the same indices
+// cooperatively, where z, y, x are the indices for the normalized input/output
+// tensor (see the document for FindTranspose021 for the definition of
+// normalized tensor for 0-2-1 transpose). This shmem transpose implementation
+// requires that the computation of the output tile only read elements within
+// the preload tile. If this is not true, we can't use a shmem transpose for P.
+//
+// If the computation of output element [z, y, x] only requires the element of
+// P with the same indices, the shmem tranpose implementation can be applied
+// to P safely. This is a sufficient but not necessary condition. We check all
+// the transitive users of P to see if we can find a user that may cause an
+// exception to the situation. If such a user is not found, we conclude that P
+// is safe for shmem transpose.
+//
+// This is trivially true for elementwise operations and some "data-movement"
+// ops like kTuple. However, it's not true for operations that can change the
+// dimensions of the inputs (e.g. pad, slice) and bitcast operation.
+// For example:
+//
+// fused_computation {
+//   param_0 = f32[64,64]{1,0} parameter(0)
+//   ROOT bitcast = f32[64,64]{0,1} bitcast(param_0)
+// }
+// The output element at logical address [0, 63] depends on the input element
+// at logical address [63, 0], which would not be within the shared-memory
+// block.
+//
+// TODO(bixia): In order to extend this for kInput fusion, that is reduction
+// with tranpose, we only need to end the use-chain checking with the input of
+// a reduce operations. In this case, the above description on "output" apply
+// to the result of such a use-chain, which provides the input to the reduce
+// operation.
+bool IsInstructionSafeForShmemTranspose(const HloInstruction* hlo) {
+  if (hlo->IsElementwise()) {
+    return absl::c_all_of(hlo->users(), [&](const HloInstruction* user) {
+      return IsInstructionSafeForShmemTranspose(user);
+    });
+  }
+
+  switch (hlo->opcode()) {
+    // Non-elementwise instructions that don't cause the shmem transpose
+    // to be unsafe, including the instructions that don't currently fuse.
+    case HloOpcode::kGetDimensionSize:
+      // The result of the operation doesn't rely on the content of the
+      // tensor. As such, there is no need to further inspect its users.
+      return true;
+    case HloOpcode::kGetTupleElement:
+    case HloOpcode::kMap:
+    case HloOpcode::kParameter:
+    case HloOpcode::kTuple:
+    case HloOpcode::kTupleSelect:
+      return absl::c_all_of(hlo->users(), [&](const HloInstruction* user) {
+        return IsInstructionSafeForShmemTranspose(user);
+      });
 
-  if (hlo->opcode() == HloOpcode::kFusion) {
-    return absl::c_all_of(hlo->fused_instructions_computation()->instructions(),
-                          is_safe_for_tile_based_transpose);
+    default:
+      return false;
   }
+}
 
-  return is_safe_for_tile_based_transpose(hlo);
+// Given a group of input parameters that are 0-2-1 tranpose of the outputs of
+// a fusion kernel, returns the input parameters that are safe for the shared
+// memory tranpose implementation.
+//
+// When a tile based shared memory transpose is used to implement an input with
+// 0-2-1 transpose, we preload a tile of the input elements
+// [z, y..y+31, x..x+31] to compute the output tile elements of the same
+// indices. Preloading the input tile this way is only safe when the computation
+// of the output tile elements do not need any input element outside the
+// preloaded tile. We inspect all the transitive users of the input parameter
+// up to the fusion root instruction to see if we can find any instruction
+// that can make preloading the input tile unsafe.
+std::vector<int64> FilterInputsForShmemTranspose(const HloInstruction* fusion,
+                                                 std::vector<int64> input_ids) {
+  std::vector<int64> filtered_input_ids;
+  for (int64 i = 0; i < input_ids.size(); ++i) {
+    const HloInstruction* input = fusion->fused_parameter(input_ids[i]);
+    if (IsInstructionSafeForShmemTranspose(input)) {
+      filtered_input_ids.push_back(input_ids[i]);
+    } else {
+      VLOG(10) << "Input not safe for shmem transpose " << input->ToString()
+               << "\n";
+    }
+  }
+  return filtered_input_ids;
 }
+
 }  // namespace
 
 bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
@@ -3106,8 +3256,11 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
     return false;
   }
 
-  if (!IsInstructionSafeForTileBasedTranspose(hlo)) {
-    return false;
+  if (opcode == HloOpcode::kFusion) {
+    params_012 = FilterInputsForShmemTranspose(hlo, params_012);
+    if (params_012.empty()) {
+      return false;
+    }
   }
 
   // Each of our shared memory tiles has 32*33 elements (so ~4kb, if the
@@ -3250,11 +3403,101 @@ std::tuple<int64, int64, int64> GetReductionToVectorDimensions(
   return std::make_tuple(num_reduced_major, num_kept, num_reduced_minor);
 }
 
+// Returns true if all the transitive users of hlo before hitting users in
+// use_chain_endings are elementwise operations.
+bool AreUsersElementwise(const HloInstruction* hlo,
+                         const ConstHloInstructionSet& use_chain_endings) {
+  return absl::c_all_of(hlo->users(), [&](const HloInstruction* user) {
+    return use_chain_endings.count(user) ||
+           (user->IsElementwise() &&
+            AreUsersElementwise(user, use_chain_endings));
+  });
+}
+
+// Returns the number of fusion inputs that have the same dimension as the
+// given shape, and involve in only elementwise operations.
+int64 NumInputsInvolveInOnlyElementwiseOps(
+    const HloInstruction* unnested_hlo, const Shape& op_shape,
+    const ConstHloInstructionSet& use_chain_endings) {
+  return absl::c_count_if(
+      unnested_hlo->fused_parameters(), [&](const HloInstruction* parameter) {
+        const Shape& parameter_shape = parameter->shape();
+        return ShapeUtil::SameDimensions(op_shape, parameter_shape) &&
+               AreUsersElementwise(parameter, use_chain_endings);
+      });
+}
+
+// Returns the number of fusion inputs that have more elements than the given
+// shape.
+int64 NumInputsWithMoreElementsThan(const HloInstruction* unnested_hlo,
+                                    const Shape& shape) {
+  int64 num_elements = ShapeUtil::ElementsIn(shape);
+  return absl::c_count_if(
+      unnested_hlo->fused_parameters(), [&](const HloInstruction* parameter) {
+        return ShapeUtil::ElementsIn(parameter->shape()) > num_elements;
+      });
+}
+
+// The benefit of unrolling a kInput fusion that is a column reduction comes
+// from the vectorization of non-reduction fusion outputs and fusion inputs.
+// On the other hand, unrolling can also introduce factors that can cause
+// the kernel to run slower. This routine uses a simple heuristic to estimate
+// the benefit as well as the overhead of unrolling in order to decide whether
+// unrolling is beneficial for the given kInput fusion.
+bool IsUnrollingColumnReductionBeneficial(const HloInstruction* unnested_hlo,
+                                          const Shape& input_shape,
+                                          int64 num_kept) {
+  // TODO(b/122468062): Need further investigate to see whether we can
+  // remove the constraint on IsPowerOfTwo.
+  if (!IsPowerOfTwo(static_cast<uint64>(num_kept))) {
+    return false;
+  }
+
+  if (unnested_hlo->opcode() == HloOpcode::kReduce) {
+    return true;
+  }
+
+  CHECK_EQ(unnested_hlo->opcode(), HloOpcode::kFusion);
+  int64 can_be_vectorized = 0;
+  int64 cannot_be_vectorized = 0;
+  const HloInstruction* fused_root = unnested_hlo->fused_expression_root();
+  ConstHloInstructionSet use_chain_endings;
+  if (fused_root->opcode() == HloOpcode::kReduce) {
+    use_chain_endings.insert(fused_root);
+    // Atomic.add of the reduction result can't be vectorized.
+    cannot_be_vectorized++;
+  } else {
+    CHECK_EQ(fused_root->opcode(), HloOpcode::kTuple);
+    for (const HloInstruction* instr : fused_root->operands()) {
+      if (instr->opcode() == HloOpcode::kReduce) {
+        // Atomic.add of the reduction result can't be vectorized.
+        cannot_be_vectorized++;
+      } else {
+        // Write of the non-reduction result can be vectorized.
+        can_be_vectorized++;
+      }
+      use_chain_endings.insert(instr);
+    }
+  }
+  // Fusion inputs that have the same dimension as the reduce input and
+  // only involve in elementwise operations can be vectorized.
+  can_be_vectorized += NumInputsInvolveInOnlyElementwiseOps(
+      unnested_hlo, input_shape, use_chain_endings);
+  // Fusion inputs with more elements than the reduce op input must participate
+  // in non-elementwise operations and we assume that they are not vectorizable
+  // for the purpose of estimating the benefit of unrolling. If the kernel is
+  // unrolled even with such an assumption,  and the accesses to those inputs
+  // turn out to be vectorizable, the compiler will still vectorize them.
+  cannot_be_vectorized +=
+      NumInputsWithMoreElementsThan(unnested_hlo, input_shape);
+  return can_be_vectorized >= cannot_be_vectorized;
+}
+
 }  // namespace
 
 std::tuple<KernelMappingScheme, bool>
 IrEmitterUnnested::ComputeMappingSchemeAndReductionKind(
-    const HloInstruction* first_reduce) {
+    const HloInstruction* unnested_hlo, const HloInstruction* first_reduce) {
   int64 depth = 1;
   int64 height = 1;
   int64 width = 1;
@@ -3271,6 +3514,7 @@ IrEmitterUnnested::ComputeMappingSchemeAndReductionKind(
   std::tie(num_reduced_major, num_kept, num_reduced_minor) =
       GetReductionToVectorDimensions(input_shape, first_reduce->dimensions());
   CHECK_EQ(num_output_elems, num_kept);
+  bool dilated_x = true;
 
   if (num_kept == 1) {
     // Scalar reduction is a special row reduction with depth = height = 1.
@@ -3285,13 +3529,21 @@ IrEmitterUnnested::ComputeMappingSchemeAndReductionKind(
     is_row_reduction = false;
     // Column reduction without transpose doesn't require communication among
     // threads processing elements in the same tile. The current implementation
-    // only support the use of on hardware thread block to process one block of
-    // tiles in the KernelMappingScheme. We try to maximize the values of
+    // only support the use of one hardware thread block to process one block of
+    // tiles in the KernelMappingScheme. We try to use one thread to compute
+    // the partial results for two tensor elements and to maximize the values of
     // num_threads_x and tile_size_x to allow a bigger hardware thread block.
     int64 hw_threads_per_block_limit =
         ThreadsPerBlockLimit(ir_emitter_context_->device_description());
-    tile_size_x = std::min(hw_threads_per_block_limit, num_kept);
-    num_threads_x = tile_size_x;
+    if (IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape,
+                                             num_kept)) {
+      tile_size_x = std::min(2 * hw_threads_per_block_limit, num_kept);
+      num_threads_x = tile_size_x / 2;
+      dilated_x = false;
+    } else {
+      tile_size_x = std::min(hw_threads_per_block_limit, num_kept);
+      num_threads_x = tile_size_x;
+    }
     int64 kNumElementsPerPartialSum = 128;
     tile_size_y = kNumElementsPerPartialSum;
   } else {
@@ -3320,6 +3572,7 @@ IrEmitterUnnested::ComputeMappingSchemeAndReductionKind(
   llvm_ir::KernelMappingScheme mapping_scheme(
       dims_in_elem, tile_size_y, tile_size_x, req_block_sizes, num_threads_y,
       num_threads_x, &b_);
+  mapping_scheme.SetDilatedX(dilated_x);
   return std::make_tuple(mapping_scheme, is_row_reduction);
 }
 
@@ -3368,14 +3621,15 @@ Status IrEmitterUnnested::EmitReductionToVector(HloInstruction* unnested_hlo) {
   bool is_row_reduction;
   llvm_ir::KernelMappingScheme mapping_scheme;
   std::tie(mapping_scheme, is_row_reduction) =
-      ComputeMappingSchemeAndReductionKind(first_reduce);
+      ComputeMappingSchemeAndReductionKind(unnested_hlo, first_reduce);
   ReductionCodegenInfo reduction_info(&mapping_scheme, is_row_reduction);
   KernelCodeGenerator kernel_generator(
       /*tile_element_generator=*/
       [&](HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
           const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
-          llvm::Value* x_loc) {
-        EmitTileElementForReduction(hlo, index, kernel_info, y_loc, x_loc);
+          llvm::Value* x_loc, int64 x_iter_num) {
+        EmitTileElementForReduction(hlo, index, kernel_info, y_loc, x_loc,
+                                    x_iter_num);
       },
       /*block_prologue_generator=*/
       [&](HloInstruction* hlo, KernelCodegenInfo* kernel_info) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index d217ee36cf6e9b5278024a2f78513232328e7538..21b842bb2cd63ac454f85556df20ae5877cecbe1 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -76,7 +76,6 @@ class IrEmitterUnnested : public IrEmitter {
     void SetLaneId(llvm::Value* v) { lane_id_ = v; }
     void SetIndexType(llvm::Type* t) { index_ty_ = t; }
     void SetTiledParamInfo(llvm_ir::TiledParameterInfo* tiled_param_info) {
-      CHECK_EQ(tiled_param_info_, nullptr);
       tiled_param_info_ = tiled_param_info;
     }
 
@@ -89,7 +88,7 @@ class IrEmitterUnnested : public IrEmitter {
     }
     llvm::Type* GetIndexType() const { return index_ty_; }
 
-   private:
+   protected:
     llvm_ir::KernelMappingScheme* mapping_scheme_;
     llvm_ir::TiledParameterInfo* tiled_param_info_;
     llvm::Value* lane_id_;
@@ -109,10 +108,12 @@ class IrEmitterUnnested : public IrEmitter {
   // y_loc: The y coordinate within a tile.
   // x_loc: The x coordinate within a tile.
   // kernel_info: Other information to support the kernel code generation.
+  // x_iter_num: When a thread process N elements in the X dimension, x_iter_num
+  //             has a value of 0..N-1 to identify the element being process.
   using TileElementGenerator = std::function<void(
       HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
       const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
-      llvm::Value* x_loc)>;
+      llvm::Value* x_loc, int64 x_iter_num)>;
 
   // KernelCodeGenerator records the code generator objects that generate code
   // for tile elements or tile block prologue/epilogue.
@@ -216,9 +217,13 @@ class IrEmitterUnnested : public IrEmitter {
   Status EmitReductionToVector(HloInstruction* unnested_hlo);
 
   // Computes the KernelMappingScheme for the reduce HLO and indicates whether
-  // the reduction is a row reduction.
+  // the reduction is a row reduction. For an un-fused reduce op, unnested_hlo
+  // and first_reduce are the same instruction. For a kInput fusion,
+  // unnested_hlo is the fusion instruction while first_reduce is the first
+  // reduce op.
   std::tuple<llvm_ir::KernelMappingScheme, bool>
-  ComputeMappingSchemeAndReductionKind(const HloInstruction* first_reduce);
+  ComputeMappingSchemeAndReductionKind(const HloInstruction* unnested_hlo,
+                                       const HloInstruction* first_reduce);
 
   // Emits code for an in-place scatter, modifying `thunk`s launch dimensions in
   // the process. `scatter` may be fused, scatter indices are taken from
@@ -243,26 +248,29 @@ class IrEmitterUnnested : public IrEmitter {
                               const KernelCodeGenerator& kernel_generator,
                               KernelCodegenInfo* kernel_info);
   void EmitBlock(const TileGenerator& emit_one_tile,
-                 const KernelCodegenInfo* kernel_info,
-                 KernelSupportLibrary& ksl, llvm::Type* index_ty);
+                 KernelCodegenInfo* kernel_info, KernelSupportLibrary* ksl,
+                 llvm::Type* index_ty);
   // Emits code to process a tensor element in a tile for the given kCopy HLO
   // that performs a 0-2-1 transpose.
   void EmitTileElementForCopy(HloInstruction* hlo,
                               const llvm_ir::IrArray::Index& index,
                               const KernelCodegenInfo* kernel_info,
-                              llvm::Value* y_loc, llvm::Value* x_loc);
+                              llvm::Value* y_loc, llvm::Value* x_loc,
+                              int64 x_iter_num);
   // Emits code to process a tensor element in a tile for the given kLoop fusion
   // HLO containing parameters that are 0-2-1 transpose of its outputs.
   void EmitTileElementForFusion(HloInstruction* hlo,
                                 const llvm_ir::IrArray::Index& index,
                                 const KernelCodegenInfo* kernel_info,
-                                llvm::Value* y_loc, llvm::Value* x_loc);
+                                llvm::Value* y_loc, llvm::Value* x_loc,
+                                int64 x_iter_num);
   // Emits code to process a tensor element in a tile for the given input hlo
   // that is either a unnested kReduce or a kInput fusion.
   void EmitTileElementForReduction(HloInstruction* unnested_hlo,
                                    const llvm_ir::IrArray::Index& index,
                                    const KernelCodegenInfo* kernel_info,
-                                   llvm::Value* y_loc, llvm::Value* x_loc);
+                                   llvm::Value* y_loc, llvm::Value* x_loc,
+                                   int64 x_iter_num);
   // Prepares for the code generation for a tile block of a reduction kernel.
   void EmitPrologueForReduction(HloInstruction* unnested_hlo,
                                 KernelCodegenInfo* kernel_info);
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
index bd53b90b42d8e657a3ee58e7ca03fb60522aae28..eddaa877f28cb7bd32e924cd179814581eb97b12 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
@@ -125,6 +125,7 @@ static string GetSmName(std::pair<int, int> compute_capability) {
       {{6, 2}, 62},
       {{7, 0}, 70},
       {{7, 2}, 72},
+      {{7, 5}, 75},
   });
   int sm_version = 30;
   auto it = m->find(compute_capability);
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
index 01fddcede64d1bb02ab89db5fc9524893c2d47a4..02e1207f377b8c28bf2566bee8cf3bcbc66794fb 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
@@ -67,7 +67,7 @@ int64 GpuMultiOutputFusion::GetProfit(HloInstruction* instr1,
   }
   int64 profit = 0;
   for (auto instr : instr2->operands()) {
-    if (!IsProfitableOperand(instr) || in_list.count(instr) == 0) {
+    if (!IsProfitableOperand(instr) || !in_list.contains(instr)) {
       continue;
     }
     profit += ShapeUtil::ByteSizeOf(instr->shape());
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
index d16c87ba5c63aa582753fe949e9e39ee2d8b81e5..40b87b16a195564c9b98497f79a70f1db0539d87 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
@@ -628,8 +628,7 @@ TEST_F(MultiOutputFusionTest, MultiOutputFusionDUS) {
       p.1 = s32[1]{0} parameter(1)
       p.2 = f16[1,96,1024]{2,1,0} parameter(2)
       c.0 = s32[] constant(0)
-      pad = s32[3]{0} pad(p.1, c.0), padding=0_2
-      ROOT %dynamic-update-slice = f16[50,96,1024]{2,1,0} dynamic-update-slice(p.0, p.2, pad)
+      ROOT %dynamic-update-slice = f16[50,96,1024]{2,1,0} dynamic-update-slice(p.0, p.2, p.1, c.0, c.0)
     }
 
     fusion.2 {
@@ -638,7 +637,7 @@ TEST_F(MultiOutputFusionTest, MultiOutputFusionDUS) {
       p.2 = f16[1,96,1024]{2,1,0} parameter(2)
       c.0 = s32[] constant(0)
       pad = s32[3]{0} pad(p.1, c.0), padding=0_2
-      ROOT %dynamic-update-slice = f16[50,96,1024]{2,1,0} dynamic-update-slice(p.0, p.2, pad)
+      ROOT %dynamic-update-slice = f16[50,96,1024]{2,1,0} dynamic-update-slice(p.0, p.2, p.1, c.0, c.0)
     }
 
     ENTRY entry {
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
index cd369d55987b96eed2efb64ae0df6b3a76acb672..d1522280baf9a153d0731abfbc5a683df6b1cc53 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
@@ -37,6 +37,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/call_inliner.h"
 #include "tensorflow/compiler/xla/service/conditional_simplifier.h"
 #include "tensorflow/compiler/xla/service/convolution_group_converter.h"
+#include "tensorflow/compiler/xla/service/dot_decomposer.h"
+#include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h"
@@ -152,6 +154,7 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
     HloPassPipeline pipeline("optimization");
     pipeline.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/false,
                                               /*allow_mixed_precision=*/false);
+    pipeline.AddPass<DynamicIndexSplitter>();
     pipeline.AddPass<GpuHloSupportChecker>();
     ReducePrecisionInsertion::AddPasses(
         &pipeline, hlo_module->config().debug_options(),
@@ -163,6 +166,7 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
       // We need a cost model for GPUs. Currently, do nothing.
       return false;
     };
+    pipeline.AddPass<DotDecomposer>(false);
     pipeline.AddPass<ConvolutionGroupConverter>(
         cost_model,
         /*convert_batch_groups_only=*/true);
diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
index 4775baf44aecfe6adaf2bf0d2791595436635b16..1dedbd3befce6e2ceb06126d83a061207a90dd8f 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
@@ -25,7 +26,7 @@ namespace xla {
 namespace gpu {
 
 bool StreamAssignment::HasStreamAssigned(const HloInstruction& hlo) const {
-  return hlo_to_stream_number_.count(&hlo);
+  return hlo_to_stream_number_.contains(&hlo);
 }
 
 int StreamAssignment::StreamNumberForHlo(const HloInstruction& hlo) const {
@@ -98,10 +99,10 @@ int ComputeStreamToAssign(
   // greedy approach. First, we compute as forbidden_stream_numbers the
   // streams assigned to GEMMs that are concurrent with `hlo`. Then, we assign
   // `hlo` a different stream.
-  std::set<int> forbidden_stream_numbers;
+  absl::flat_hash_set<int> forbidden_stream_numbers;
   for (const auto* seen_gemm : seen_gemms) {
     int stream_num = stream_assignment.StreamNumberForHlo(*seen_gemm);
-    if (!forbidden_stream_numbers.count(stream_num) &&
+    if (!forbidden_stream_numbers.contains(stream_num) &&
         CanRunConcurrently(*seen_gemm, hlo, reachability)) {
       forbidden_stream_numbers.insert(stream_num);
     }
@@ -109,7 +110,7 @@ int ComputeStreamToAssign(
 
   for (int stream_num = 0; stream_num < stream_assignment.StreamCount();
        ++stream_num) {
-    if (!forbidden_stream_numbers.count(stream_num)) {
+    if (!forbidden_stream_numbers.contains(stream_num)) {
       return stream_num;
     }
   }
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
index a302b582ede3723acd118d2e4a4bb3efdf7a4d0b..869724db601b2d5e4ed6d3c7bf3e10a748433146 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
@@ -65,7 +65,7 @@ TEST_F(GpuKernelTilingTest, UnnestedTransposeWithProperDimensionsTiled) {
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @copy
-; CHECK: tail call void @llvm.nvvm.barrier0()
+; CHECK: call void @llvm.nvvm.barrier0()
 ; CHECK: }
 )",
                      /*match_optimized_ir=*/true);
@@ -91,7 +91,7 @@ TEST_F(GpuKernelTilingTest, UnnestedTransposeWithSmallDimensionsNotTiled) {
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @copy
-; CHECK-NOT: tail call void @llvm.nvvm.barrier0()
+; CHECK-NOT: call void @llvm.nvvm.barrier0()
 ; CHECK: }
 )",
                      /*match_optimized_ir=*/true);
@@ -118,7 +118,7 @@ TEST_F(GpuKernelTilingTest, SimpleFusionWithTransposeTiled) {
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @fusion
-; CHECK: tail call void @llvm.nvvm.barrier0()
+; CHECK: call void @llvm.nvvm.barrier0()
 ; CHECK: }
 )",
                      /*match_optimized_ir=*/true);
@@ -152,7 +152,7 @@ TEST_F(GpuKernelTilingTest, MultipleOutputFusionWithOnePossibleTransposeTiled) {
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @fusion
-; CHECK: tail call void @llvm.nvvm.barrier0()
+; CHECK: call void @llvm.nvvm.barrier0()
 ; CHECK: }
 )",
                      /*match_optimized_ir=*/true);
@@ -187,13 +187,13 @@ TEST_F(GpuKernelTilingTest,
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @fusion
-; CHECK-NOT: tail call void @llvm.nvvm.barrier0()
+; CHECK-NOT: call void @llvm.nvvm.barrier0()
 ; CHECK: }
 )",
                      /*match_optimized_ir=*/true);
 }
 
-TEST_F(GpuKernelTilingTest, FusionTransposeWithReverseNotTiled) {
+TEST_F(GpuKernelTilingTest, TransposedInputWithUserReverseNotTiled) {
   const char *const kHloString = R"(
     HloModule FusionTransposeWithReverseNotTiled
     fused_computation.1 {
@@ -214,12 +214,203 @@ TEST_F(GpuKernelTilingTest, FusionTransposeWithReverseNotTiled) {
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @fusion
-; CHECK-NOT: tail call void @llvm.nvvm.barrier0()
+; CHECK-NOT: call void @llvm.nvvm.barrier0()
 ; CHECK: }
 )",
                      /*match_optimized_ir=*/true);
 }
 
+TEST_F(GpuKernelTilingTest, TransposedInputWithUserBitcastNotTiled) {
+  const char *const kHloString = R"(
+    HloModule TransposedInputWithUserBitcast
+
+    fused_computation {
+      param_0 = f32[20,20]{1,0} parameter(0)
+      ROOT bitcast = f32[20,20]{0,1} bitcast(param_0)
+    }
+
+    ENTRY kernel_entry {
+      parameter.0 = f32[20,20]{1,0} parameter(0)
+      ROOT fusion = f32[20,20]{0,1} fusion(parameter.0),
+        kind=kLoop, calls=fused_computation
+    })";
+
+  // Check that a call to llvm.nvvm.barrier0 is not generated.
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK-NOT: call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+
+  // Check that the kernel runs correctly.
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0}));
+}
+
+TEST_F(GpuKernelTilingTest, TransposedInputWithoutUnsafeUseTiled) {
+  const char *const kHloString = R"(
+    HloModule TwoTransposedInputs
+
+    fused_computation {
+      param_0 = f32[64,64]{1,0} parameter(0)
+      param_1 = f32[64,64]{1,0} parameter(1)
+      bitcast = f32[64,64]{0,1} bitcast(param_0)
+      copy = f32[64,64]{0,1} copy(param_1)
+      ROOT tuple = (f32[64,64]{0,1}, f32[64,64]{0,1}) tuple(bitcast, copy)
+    }
+
+    ENTRY kernel_entry {
+      parameter.0 = f32[64,64]{1,0} parameter(0)
+      parameter.1 = f32[64,64]{1,0} parameter(1)
+      ROOT fusion = (f32[64,64]{0,1}, f32[64,64]{0,1})
+        fusion(parameter.0, parameter.1),
+        kind=kLoop, calls=fused_computation
+    })";
+
+  // Check that a call to llvm.nvvm.barrier0 is generated.
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK: call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+  // Check that the kernel runs correctly.
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0}));
+}
+
+TEST_F(GpuKernelTilingTest, ColumnReductionWithPowerOf2OutputElementsUnrolled) {
+  const char *const kHloString = R"(
+  HloModule column_reduce_powerof2
+
+  reduction {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT add = f32[] add(x, y)
+  }
+
+  ENTRY kernel_entry {
+    constant0 = f32[] constant(0)
+    arg1 = f16[1024,512]{1,0} parameter(0)
+    arg1_conv = f32[1024,512]{1,0} convert(arg1)
+    ROOT reduce = f32[512]{0} reduce(arg1_conv, constant0), dimensions={0}, to_apply=reduction
+  })";
+
+  // Check that two calls to llvm.nvvm.atomic are generated.
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK-NOT: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+  // Check that the kernel runs correctly.
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
+}
+
+TEST_F(GpuKernelTilingTest,
+       ColumnReductionWithInputLargerThenReduceInputNotUnrolled) {
+  const char *const kHloString = R"(
+  HloModule larger_than_reduce_input_parameter
+
+  reduction22 {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT add = f32[] add(x, y)
+  }
+
+  fused_computation {
+    constant0 = f32[] constant(0)
+    arg.1 = f16[1024,512]{1,0} parameter(0)
+    arg.2 = f16[1027,513]{1,0} parameter(1)
+    arg1.conv = f32[1024,512]{1,0} convert(arg.1)
+    arg2.conv = f32[1027,513]{1,0} convert(arg.2)
+    slice2 = f32[1024,512]{1,0} slice(arg2.conv), slice={[2:1026], [1:513]}
+    add2 = f32[1024,512]{1,0} add(arg1.conv, slice2)
+    ROOT reduce = f32[512]{0} reduce(add2, constant0), dimensions={0},
+      to_apply=reduction22
+  }
+
+  ENTRY kernel_entry {
+    arg1 = f16[1024,512]{1,0} parameter(0)
+    arg2 = f16[1027,513]{1,0} parameter(1)
+    ROOT fusion = f32[512]{0} fusion(arg1, arg2), kind=kInput,
+      calls=fused_computation
+  })";
+
+  // Check that one call to llvm.nvvm.atomic is generated.
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK-NOT: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+  // Check that the kernel runs correctly.
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
+}
+
+TEST_F(GpuKernelTilingTest, ColumnReductionMOFUnrolled) {
+  const char *const kHloString = R"(
+  HloModule column_reduce_powerof2_mof
+
+  reduction22 {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT add = f32[] add(x, y)
+  }
+
+  fused_computation {
+    constant0 = f32[] constant(0)
+    arg.1 = f16[1024,512]{1,0} parameter(0)
+    arg.2 = f16[1024,512]{1,0} parameter(1)
+    arg1.conv = f32[1024,512]{1,0} convert(arg.1)
+    arg2.conv = f32[1024,512]{1,0} convert(arg.2)
+    reduce1 = f32[512]{0} reduce(arg1.conv, constant0), dimensions={0},
+      to_apply=reduction22
+    reduce2 = f32[512]{0} reduce(arg2.conv, constant0), dimensions={0},
+      to_apply=reduction22
+    add = f32[1024,512]{1,0} add(arg1.conv, arg2.conv)
+    ROOT tuple = (f32[512]{0}, f32[512]{0}, f32[1024,512]{1,0})
+      tuple(reduce1, reduce2, add)
+  }
+
+  ENTRY kernel_entry {
+    arg1 = f16[1024,512]{1,0} parameter(0)
+    arg2 = f16[1024,512]{1,0} parameter(1)
+    ROOT fusion = (f32[512]{0}, f32[512]{0}, f32[1024,512]{1,0})
+      fusion(arg1, arg2), kind=kInput, calls=fused_computation
+  })";
+
+  // Check that four calls to llvm.nvvm.atomic are generated.
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK-NOT: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+  // Check that the kernel runs correctly.
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
+}
 }  // namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
index 6b2d76764a077dc6cfa3f9ddc6e525ab330323be..25bad67bab9375559c431466571c62acd0452b01 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
+++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
@@ -14,17 +14,19 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h"
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 
 namespace xla {
 namespace gpu {
 
 void ThunkSchedule::AddDependenciesOnTransitiveOperands(
     const Thunk& thunk, const HloInstruction& operand,
-    const std::unordered_map<const HloInstruction*, Thunk*>& hlo_to_thunk) {
-  if (hlo_to_thunk.count(&operand)) {
+    const absl::flat_hash_map<const HloInstruction*, Thunk*>& hlo_to_thunk) {
+  if (hlo_to_thunk.contains(&operand)) {
     // If `operand` is mapped to a thunk, adds `operand` to `thunk`'s dependency
     // list if `operand` is assigned to a different stream. As an optimization,
     // we skip `operand`'s operands because `operand` depends on them already.
@@ -48,14 +50,14 @@ ThunkSchedule::ThunkSchedule(
     const std::vector<HloInstruction*>& hlo_total_order)
     : thunks_(std::move(thunks)),
       stream_assignment_(std::move(stream_assignment)) {
-  std::unordered_map<const HloInstruction*, Thunk*> hlo_to_thunk;
+  absl::flat_hash_map<const HloInstruction*, Thunk*> hlo_to_thunk;
   for (const auto& thunk : *thunks_) {
     InsertOrDie(&hlo_to_thunk, thunk->hlo_instruction(), thunk.get());
   }
 
   for (HloInstruction* hlo : hlo_total_order) {
-    if (hlo_to_thunk.count(hlo)) {
-      thunk_total_order_.push_back(FindOrDie(hlo_to_thunk, hlo));
+    if (Thunk** thunk = tensorflow::gtl::FindOrNull(hlo_to_thunk, hlo)) {
+      thunk_total_order_.push_back(*thunk);
     }
   }
 
@@ -106,7 +108,7 @@ void ThunkSchedule::RemoveRedundantDependencyEdges() {
   // redundant dependency edge.
   Array2D<int> last_dependency(stream_count, stream_count, -1);
   for (const Thunk* dst : thunk_total_order_) {
-    if (!depends_on_.count(dst)) {
+    if (!depends_on_.contains(dst)) {
       continue;
     }
 
@@ -134,7 +136,7 @@ void ThunkSchedule::RemoveRedundantDependencyEdges() {
 
 const std::list<const Thunk*>& ThunkSchedule::DependsOn(
     const Thunk* thunk) const {
-  if (depends_on_.count(thunk)) {
+  if (depends_on_.contains(thunk)) {
     return FindOrDie(depends_on_, thunk);
   } else {
     return empty_thunk_list_;
diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
index 43b628a1baf0e79a3197f3cfad3547991642eaed..549378debd52417252724a5d8a6f4d24f2ad0369 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
@@ -21,6 +21,8 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/thunk.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -54,7 +56,9 @@ class ThunkSchedule {
   // Thunks that `thunk` depends on.
   const std::list<const Thunk*>& DependsOn(const Thunk* thunk) const;
   // Whether `thunk` is depended by another thunk.
-  bool Depended(const Thunk* thunk) const { return depended_by_.count(thunk); }
+  bool Depended(const Thunk* thunk) const {
+    return depended_by_.contains(thunk);
+  }
 
   // Delegates to StreamAssignment.
   int StreamCount() const { return stream_assignment_->StreamCount(); }
@@ -75,13 +79,13 @@ class ThunkSchedule {
   // thunk.hlo_instruction().
   void AddDependenciesOnTransitiveOperands(
       const Thunk& thunk, const HloInstruction& operand,
-      const std::unordered_map<const HloInstruction*, Thunk*>& hlo_to_thunk);
+      const absl::flat_hash_map<const HloInstruction*, Thunk*>& hlo_to_thunk);
 
   std::unique_ptr<ThunkSequence> thunks_;
   std::vector<Thunk*> thunk_total_order_;
 
-  std::unordered_map<const Thunk*, std::list<const Thunk*>> depends_on_;
-  std::set<const Thunk*> depended_by_;
+  absl::flat_hash_map<const Thunk*, std::list<const Thunk*>> depends_on_;
+  absl::flat_hash_set<const Thunk*> depended_by_;
   std::list<const Thunk*> empty_thunk_list_;
 
   std::unique_ptr<StreamAssignment> stream_assignment_;
diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc
index 9220865867b770eebfb1ada8f31a5d24693a4b8d..4fca981c6a59cdb91a997e6a887fd26472c1a10a 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator.cc
@@ -199,7 +199,7 @@ Status HeapSimulator::RunComputation(
 
       // If the buffer has no users and isn't an entry parameter or output, it
       // must be a dead value.
-      if (live_buffers.count(buffer) == 0) {
+      if (!live_buffers.contains(buffer)) {
         dead_buffers_to_free.push_back(buffer);
       }
     }
@@ -225,10 +225,10 @@ Status HeapSimulator::RunComputation(
       }
     }
     // Sort to get a deterministic iteration order.
-    std::sort(operand_buffers_to_free.begin(), operand_buffers_to_free.end(),
-              [](const BufferValue* x, const BufferValue* y) {
-                return x->id() < y->id();
-              });
+    absl::c_sort(operand_buffers_to_free,
+                 [](const BufferValue* x, const BufferValue* y) {
+                   return x->id() < y->id();
+                 });
 
     // Allocate buffers defined by this instruction.  This is the latest point
     // that we can allocate; right before the buffer is first used.  This must
@@ -253,7 +253,7 @@ Status HeapSimulator::RunComputation(
       bool shared = false;
       if (options_.may_reuse_operand_buffers) {
         for (const BufferValue* operand_buffer : operand_buffers_to_free) {
-          if (reused_buffers.count(operand_buffer) != 0) {
+          if (reused_buffers.contains(operand_buffer)) {
             continue;
           }
           if (buffer->instruction()->IsUserOf(operand_buffer->instruction()) &&
@@ -335,10 +335,9 @@ Status HeapSimulator::RunComputation(
     to_free.push_back(buffer);
   }
 
-  std::sort(to_free.begin(), to_free.end(),
-            [](const BufferValue* x, const BufferValue* y) {
-              return x->id() < y->id();
-            });
+  absl::c_sort(to_free, [](const BufferValue* x, const BufferValue* y) {
+    return x->id() < y->id();
+  });
   for (const BufferValue* buffer : to_free) {
     VLOG(3) << "Freeing pending: " << buffer->ToString();
     Free(buffer, root);
@@ -374,15 +373,15 @@ bool HeapSimulator::IgnoreBuffer(const BufferValue* buffer) const {
     return true;
   }
   return options_.buffers_to_assign != nullptr &&
-         options_.buffers_to_assign->count(buffer) == 0;
+         !options_.buffers_to_assign->contains(buffer);
 }
 
 // Alloc always calls the underlying heap algorithm.
 void HeapSimulator::Alloc(const BufferValue* buffer,
                           const HloInstruction* instruction) {
-  CHECK(allocated_buffers_.count(buffer) == 0)
+  CHECK(!allocated_buffers_.contains(buffer))
       << "Alloc called on allocated buffer: " << *buffer;
-  CHECK(freed_buffers_.count(buffer) == 0)
+  CHECK(!freed_buffers_.contains(buffer))
       << "Alloc called on freed buffer: " << *buffer;
 
   allocated_buffers_.insert(buffer);
@@ -411,9 +410,9 @@ void HeapSimulator::Free(const BufferValue* buffer,
     buffer = group->canonical;
   }
 
-  CHECK(allocated_buffers_.count(buffer) > 0)
+  CHECK(allocated_buffers_.contains(buffer))
       << "Free called on non-allocated buffer: " << *buffer;
-  CHECK(freed_buffers_.count(buffer) == 0)
+  CHECK(!freed_buffers_.contains(buffer))
       << "Free called on freed buffer: " << *buffer;
 
   freed_buffers_.insert(buffer);
@@ -433,11 +432,11 @@ void HeapSimulator::ShareBuffer(const BufferValue* buffer,
                                 const HloInstruction* instruction) {
   CHECK_LE(size_fn_(*buffer), size_fn_(*shared))
       << "ShareBuffer oversized buffer" << *buffer << " shared: " << *shared;
-  CHECK(allocated_buffers_.count(buffer) == 0)
+  CHECK(!allocated_buffers_.contains(buffer))
       << "ShareBuffer called on allocated buffer: " << *buffer;
-  CHECK(freed_buffers_.count(buffer) == 0)
+  CHECK(!freed_buffers_.contains(buffer))
       << "ShareBuffer called on freed buffer: " << *buffer;
-  CHECK(freed_buffers_.count(shared) == 0)
+  CHECK(!freed_buffers_.contains(shared))
       << "ShareBuffer called on freed shared buffer: " << *shared;
 
   const BufferValue* canonical = nullptr;
@@ -452,7 +451,7 @@ void HeapSimulator::ShareBuffer(const BufferValue* buffer,
   } else {
     // The 'shared' buffer doesn't have a group; it must be the canonical.  Add
     // both 'buffer' and 'shared' to a new group.
-    CHECK(allocated_buffers_.count(shared) > 0)
+    CHECK(allocated_buffers_.contains(shared))
         << "ShareBuffer called on non-allocated shared buffer: " << *shared;
     auto group = std::make_shared<SharedGroup>();
     canonical = shared;
@@ -596,7 +595,7 @@ void DecreasingSizeRunsHeap::CallAndDrainRun() {
   }
 
   // Call ops in the run sorted by decreasing size, breaking ties by buffer id.
-  std::sort(run_.begin(), run_.end(), [](const Op& a, const Op& b) {
+  absl::c_sort(run_, [](const Op& a, const Op& b) {
     if (a.size != b.size) {
       return a.size > b.size;
     }
@@ -866,23 +865,23 @@ HeapSimulator::Result GlobalDecreasingSizeBestFitHeap::Finish() {
   for (auto& entry : buffer_intervals_) {
     sorted_buffer_intervals.push_back(entry.second);
   }
-  std::sort(sorted_buffer_intervals.begin(), sorted_buffer_intervals.end(),
-            [](const BufferInterval& x, const BufferInterval& y) {
-              if (x.size != y.size) {
-                return x.size > y.size;
-              }
-              if (x.end - x.start != y.end - y.start) {
-                return x.end - x.start > y.end - y.start;
-              }
-              return x.buffer->id() < y.buffer->id();
-            });
+  absl::c_sort(sorted_buffer_intervals,
+               [](const BufferInterval& x, const BufferInterval& y) {
+                 if (x.size != y.size) {
+                   return x.size > y.size;
+                 }
+                 if (x.end - x.start != y.end - y.start) {
+                   return x.end - x.start > y.end - y.start;
+                 }
+                 return x.buffer->id() < y.buffer->id();
+               });
 
   BufferIntervalTree interval_tree(sorted_buffer_intervals.size());
   for (auto& buffer_interval : sorted_buffer_intervals) {
     auto chunks_overlapping_in_time = interval_tree.ChunksOverlappingInTime(
         buffer_interval.start, buffer_interval.end);
-    std::sort(
-        chunks_overlapping_in_time.begin(), chunks_overlapping_in_time.end(),
+    absl::c_sort(
+        chunks_overlapping_in_time,
         [](const Chunk& x, const Chunk& y) { return x.offset < y.offset; });
 
     // Find the minimum free chunk that can hold this buffer.
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index 9b50f1ca5b5365463f32106fc005ef2c63f2e37a..263b42a29dbb0dbc0fb6eca7968674ff242f45ed 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -229,6 +229,18 @@ message HloScheduleProto {
 }
 
 message HloInputOutputAliasProto {
+  enum Kind {
+    // Define a UNDEFINED_ALIAS equal to zero to get around the default-0 proto3
+    // behavior and missing has_*() APIs.
+    UNDEFINED_ALIAS = 0;
+    // An alias setup by the user as must alias. A use setting USER_ALIAS is
+    // expecting the designed output to be dropped over the given input
+    // parameter number+index.
+    USER_ALIAS = 1;
+    // An alias setup by the compiler as part of its optimizations.
+    SYSTEM_ALIAS = 2;
+  }
+
   // The following proto describes a pair of aliased an input
   // (described by parameter number and a ShapeIndex of the parameter)
   // and an output (described by a ShapeIndex of the root
@@ -249,6 +261,8 @@ message HloInputOutputAliasProto {
     int64 parameter_number = 2;
     // ShapeIndex of the parameter instruction.
     repeated int64 parameter_shape_index = 3;
+    // The kind of alias to be setup.
+    Kind kind = 4;
   }
 
   repeated AliasEntryProto entries = 1;
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
index 68094d8907f30202533672376b2199e7b77dc806..e511f1951c5dd07ebb64fa38fd5b7f6a0e87b429 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
@@ -117,7 +117,7 @@ class BufferValueMap {
     for (const auto& pair : buffers_) {
       buffer_numbers.push_back(pair.first);
     }
-    std::sort(buffer_numbers.begin(), buffer_numbers.end());
+    absl::c_sort(buffer_numbers);
     return buffer_numbers;
   }
 
@@ -176,13 +176,12 @@ class BufferValueMap {
       const HloValue& value, std::vector<BufferNumber>* aliased_buffers) {
     // Get parameter value from an aliased_input object.
     const auto get_parameter_value =
-        [this](const std::pair<int64, ShapeIndex>& aliased_input)
+        [this](const HloInputOutputAliasConfig::Alias& aliased_input)
         -> const HloValue& {
-      int64 param_number = aliased_input.first;
-      const ShapeIndex& param_index = aliased_input.second;
       return dataflow_.GetUniqueValueAt(
-          module_->entry_computation()->parameter_instruction(param_number),
-          param_index);
+          module_->entry_computation()->parameter_instruction(
+              aliased_input.parameter_number),
+          aliased_input.parameter_index);
     };
 
     // If the value shows up in a root instruction, alias it with parameter
@@ -319,7 +318,7 @@ class BufferValueMap {
     ComputeWhileAliasedBuffers(value, &aliased_buffers);
     ComputeConditionalAliasedBuffers(value, &aliased_buffers);
     // Uniquify aliased buffers.
-    std::sort(aliased_buffers.begin(), aliased_buffers.end());
+    absl::c_sort(aliased_buffers);
     aliased_buffers.erase(
         std::unique(aliased_buffers.begin(), aliased_buffers.end()),
         aliased_buffers.end());
@@ -367,7 +366,7 @@ std::vector<const HloBuffer*> HloAliasAnalysis::ComputeBuffersAt(
   }
 
   // Sort and uniquify vector before returning.
-  std::sort(buffers.begin(), buffers.end(), HloBuffer::IdLessThan);
+  absl::c_sort(buffers, HloBuffer::IdLessThan);
   buffers.erase(std::unique(buffers.begin(), buffers.end()), buffers.end());
 
   return buffers;
@@ -430,8 +429,7 @@ Status HloAliasAnalysis::Verify() const {
   for (const auto& pair : value_to_buffer_) {
     const HloValue* value = pair.first;
     const HloBuffer& buffer = *pair.second;
-    TF_RET_CHECK(std::find(buffer.values().begin(), buffer.values().end(),
-                           value) != buffer.values().end());
+    TF_RET_CHECK(absl::c_linear_search(buffer.values(), value));
   }
 
   for (HloBuffer::Id id = 0; id < buffers_.size(); ++id) {
@@ -515,7 +513,7 @@ StatusOr<std::unique_ptr<HloAliasAnalysis>> HloAliasAnalysis::Run(
     auto& value_set = buffer_map.GetValuesInBuffer(buffer_number);
     std::vector<const HloValue*> sorted_values(value_set.begin(),
                                                value_set.end());
-    std::sort(sorted_values.begin(), sorted_values.end(), HloValue::IdLessThan);
+    absl::c_sort(sorted_values, HloValue::IdLessThan);
     alias_analysis->buffers_.emplace_back(next_id++, sorted_values);
     for (const HloValue* value : sorted_values) {
       alias_analysis->value_to_buffer_[value] =
@@ -547,16 +545,15 @@ bool HloAliasAnalysis::HasLiveRangeInterference(
     // tie-break using value ID. The tie-break is necessary because we need a
     // strict weak order for std::sort.
     std::vector<const HloValue*> values = buffer.values();
-    std::sort(values.begin(), values.end(),
-              [&ordering](const HloValue* a, const HloValue* b) {
-                if (ordering.IsDefinedBefore(*a, *b)) {
-                  return true;
-                } else if (ordering.IsDefinedBefore(*b, *a)) {
-                  return false;
-                } else {
-                  return a->id() < b->id();
-                }
-              });
+    absl::c_sort(values, [&ordering](const HloValue* a, const HloValue* b) {
+      if (ordering.IsDefinedBefore(*a, *b)) {
+        return true;
+      } else if (ordering.IsDefinedBefore(*b, *a)) {
+        return false;
+      } else {
+        return a->id() < b->id();
+      }
+    });
 
     // Walk through the ordered vector of values. First verify that the values
     // are totally ordered with respect to 'ordering', then check that no
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
index 7e6150e94153cd15463725e862ce1b8593f2c991..b6dbf07959c541bceaa8eda5a0101503970ee832 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
@@ -238,13 +238,16 @@ TEST_F(HloAliasAnalysisTest, ParametersWithAliasing) {
       builder.AddInstruction(HloInstruction::CreateTuple({negate0, negate1}));
   module_->AddEntryComputation(builder.Build());
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   // Cannot alias an output twice.
   ASSERT_IS_NOT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -279,13 +282,16 @@ TEST_F(HloAliasAnalysisTest, ParametersWithCrossAliasing) {
       builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
   module_->AddEntryComputation(builder.Build());
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{1}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   // Cannot alias an output twice.
   ASSERT_IS_NOT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -365,9 +371,11 @@ TEST_F(HloAliasAnalysisTest, InputOutputAliasingWithWhile) {
       builder.AddInstruction(HloInstruction::CreateTuple({negate_1, negate_2}));
   module_->AddEntryComputation(builder.Build());
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
diff --git a/tensorflow/compiler/xla/service/hlo_buffer.cc b/tensorflow/compiler/xla/service/hlo_buffer.cc
index 9c3aa0e64d119c2560f4955d0bcb492519fa52a2..32e48651b30bace4723169935d1f10dd7d7bfec3 100644
--- a/tensorflow/compiler/xla/service/hlo_buffer.cc
+++ b/tensorflow/compiler/xla/service/hlo_buffer.cc
@@ -49,7 +49,7 @@ std::vector<HloPosition> HloBuffer::ComputePositions() const {
                      value->positions().end());
   }
   // Remove duplicates and sort positions.
-  std::sort(positions.begin(), positions.end());
+  absl::c_sort(positions);
   positions.erase(std::unique(positions.begin(), positions.end()),
                   positions.end());
   return positions;
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index 52ca67afb8eb270c490566ed51514b0b0f499b42..f9b64d12ae83139efa21ca67e565908bd78f9780 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -207,14 +207,14 @@ Status HloComputation::RemoveInstructionAndUnusedOperands(
   TF_RET_CHECK(instruction->user_count() == 0);
   TF_RET_CHECK(IsRemovable(instruction))
       << "Cannot remove instruction: " << instruction->ToString();
-  std::unordered_set<HloInstruction*> removed;
+  absl::flat_hash_set<HloInstruction*> removed;
   std::queue<HloInstruction*> worklist;
   worklist.push(instruction);
   while (!worklist.empty()) {
     HloInstruction* item = worklist.front();
     worklist.pop();
 
-    if (removed.count(item) != 0 || item->user_count() != 0 ||
+    if (removed.contains(item) || item->user_count() != 0 ||
         item == root_instruction() || !IsRemovable(item) ||
         (item->HasSideEffect() && item != instruction)) {
       continue;
@@ -531,11 +531,10 @@ HloComputation::CreateFromProto(
   HloInstruction* root = instruction_map.at(proto.root_id());
 
   // Sort the instructions in the proto id's order.
-  std::sort(instructions.begin(), instructions.end(),
-            [&](const std::unique_ptr<HloInstruction>& a,
-                const std::unique_ptr<HloInstruction>& b) {
-              return to_proto_id[a.get()] < to_proto_id[b.get()];
-            });
+  absl::c_sort(instructions, [&](const std::unique_ptr<HloInstruction>& a,
+                                 const std::unique_ptr<HloInstruction>& b) {
+    return to_proto_id[a.get()] < to_proto_id[b.get()];
+  });
 
   TF_RETURN_IF_ERROR([&]() -> Status {
     std::vector<bool> parameters_seen(parameter_count);
@@ -694,13 +693,14 @@ bool HloComputation::operator==(const HloComputation& other) const {
   if (this == &other) {
     return true;
   }
-  std::set<std::pair<const HloInstruction*, const HloInstruction*>> visited;
+  absl::flat_hash_set<std::pair<const HloInstruction*, const HloInstruction*>>
+      visited;
   std::function<bool(const HloInstruction*, const HloInstruction*)> eq =
       [&visited, &eq](const HloInstruction* a, const HloInstruction* b) {
         // If <a,b> are visited but not identical, the recursion should have
         // been aborted. So, if <a,b> are visited at this point, they must be
         // identical.
-        if (visited.count(std::make_pair(a, b)) > 0) {
+        if (visited.contains(std::make_pair(a, b))) {
           return true;
         }
         visited.emplace(a, b);
@@ -799,17 +799,16 @@ Status HloComputation::AcceptOrdered(
     absl::Span<HloInstruction* const> order) const {
   VLOG(3) << "Accepting visitor with order.";
   for (HloInstruction* root : CollectUnreachableRoots()) {
-    TF_RET_CHECK(std::find(order.begin(), order.end(), root) != order.end())
-        << root->ToString();
+    TF_RET_CHECK(absl::c_linear_search(order, root)) << root->ToString();
   }
   TF_RET_CHECK(order.size() == instruction_count());
-  std::unordered_set<const HloInstruction*> visited;
+  absl::flat_hash_set<const HloInstruction*> visited;
   for (const HloInstruction* instruction : order) {
     VLOG(3) << "Visiting ordered: " << instruction->ToString();
-    TF_RET_CHECK(instruction_iterators_.count(instruction) == 1)
+    TF_RET_CHECK(instruction_iterators_.contains(instruction))
         << "Instruction " << instruction->name() << " is not in computation "
         << name();
-    TF_RET_CHECK(visited.count(instruction) == 0)
+    TF_RET_CHECK(!visited.contains(instruction))
         << "Instruction " << instruction->name()
         << " appears more than once in order";
     HloInstruction* mutable_instruction =
@@ -847,7 +846,7 @@ std::unique_ptr<HloComputation> HloComputation::Clone(
   return CloneWithReplacements(
       /*replacements=*/std::unordered_map<const HloInstruction*,
                                           std::unique_ptr<HloInstruction>>(),
-      context, suffix);
+      /*extra_parameters=*/{}, context, suffix);
 }
 
 std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
@@ -856,7 +855,8 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
   std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
       replacements;
   replacements.emplace(std::move(r1));
-  return CloneWithReplacements(std::move(replacements), context, suffix);
+  return CloneWithReplacements(std::move(replacements), /*extra_parameters=*/{},
+                               context, suffix);
 }
 
 std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
@@ -867,7 +867,8 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
       replacements;
   replacements.emplace(std::move(r1));
   replacements.emplace(std::move(r2));
-  return CloneWithReplacements(std::move(replacements), context, suffix);
+  return CloneWithReplacements(std::move(replacements), /*extra_parameters=*/{},
+                               context, suffix);
 }
 
 std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
@@ -880,12 +881,14 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
   replacements.emplace(std::move(r1));
   replacements.emplace(std::move(r2));
   replacements.emplace(std::move(r3));
-  return CloneWithReplacements(std::move(replacements), context, suffix);
+  return CloneWithReplacements(std::move(replacements), /*extra_parameters=*/{},
+                               context, suffix);
 }
 
 std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
     std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
         replacements,
+    absl::Span<const HloInstruction* const> extra_parameters,
     HloCloneContext* context, const string& suffix) {
   std::unique_ptr<HloCloneContext> context_ptr;
   if (context == nullptr) {
@@ -951,6 +954,12 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
   }
 
   std::vector<std::unique_ptr<HloInstruction>> instructions;
+  // First add the extra parameters to 'instructions'.
+  for (const auto& instr : extra_parameters) {
+    CHECK_EQ(instr->opcode(), HloOpcode::kParameter)
+        << "Only parameter instructions are allowed in 'extra_parameters'";
+    instructions.emplace_back(instr->Clone());
+  }
   for (auto instr : postorder) {
     std::vector<HloInstruction*> new_operands;
     for (auto operand : instr->operands()) {
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index a0ccbc583f8c409f29d31756fcc1fa1b4af7dc35..e6a1eb89cfdb474f79c184ea0eb77dba8ccd5f03 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -323,11 +323,15 @@ class HloComputation {
   // that's not already in the computation, it's cloned and added to the new
   // computation.
   //
+  // 'extra_parameters' allows to specify additional parameters that should be
+  // added to the computation.
+  //
   // All relevant instructions are cloned, *including* unique_ptr in the
   // `replacements` map.
   std::unique_ptr<HloComputation> CloneWithReplacements(
       std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
           replacements,
+      absl::Span<const HloInstruction* const> extra_parameters = {},
       HloCloneContext* context = nullptr, const string& suffix = "clone");
 
   // Convenience overloads for CloneWithReplacements.  You want to do
diff --git a/tensorflow/compiler/xla/service/hlo_computation_test.cc b/tensorflow/compiler/xla/service/hlo_computation_test.cc
index 0361c87428f6e4c031d95492a5bc782ad388e5b5..251c7bbec418d8c3e8b27277160e608840726996 100644
--- a/tensorflow/compiler/xla/service/hlo_computation_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation_test.cc
@@ -15,8 +15,12 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 
+#include <memory>
 #include <set>
+#include <unordered_map>
+#include <vector>
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -226,7 +230,7 @@ TEST_F(HloComputationTest, VisitWithMultipleRoots) {
         : computation_(computation) {}
 
     Status DefaultAction(HloInstruction* hlo_instruction) override {
-      EXPECT_EQ(0, visited_set_.count(hlo_instruction));
+      EXPECT_FALSE(visited_set_.contains(hlo_instruction));
       visited_set_.insert(hlo_instruction);
       last_visited_ = hlo_instruction;
       return Status::OK();
@@ -239,7 +243,7 @@ TEST_F(HloComputationTest, VisitWithMultipleRoots) {
     }
 
     HloComputation* computation_;
-    std::set<HloInstruction*> visited_set_;
+    absl::flat_hash_set<HloInstruction*> visited_set_;
     int64 finish_visit_calls_ = 0;
     HloInstruction* last_visited_ = nullptr;
   };
@@ -491,6 +495,41 @@ TEST_F(HloComputationTest, CloneWithControlDependency) {
   EXPECT_THAT(successors, ::testing::ElementsAre(cloned_add));
 }
 
+TEST_F(HloComputationTest, CloneWithReplacements) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape r0s64 = ShapeUtil::MakeShape(S64, {});
+  Shape r0s32 = ShapeUtil::MakeShape(S32, {});
+  Shape r0u32 = ShapeUtil::MakeShape(U32, {});
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32_, "p.0.lhs"));
+  auto param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0f32_, "p.0.rhs"));
+  auto param2 =
+      builder.AddInstruction(HloInstruction::CreateParameter(2, r0s64, "p.1"));
+  auto lt = builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, param0, param1));
+  auto module = CreateNewVerifiedModule();
+  auto computation =
+      module->AddEntryComputation(builder.Build(/*root_instruction=*/lt));
+  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+      replacements;
+  replacements.emplace(param2,
+                       HloInstruction::CreateParameter(2, r0s32, "p.1"));
+  auto param3 = HloInstruction::CreateParameter(3, r0u32, "p.2");
+  std::vector<const HloInstruction*> extra_parameters{param3.get()};
+  auto clone = computation->CloneWithReplacements(std::move(replacements),
+                                                  extra_parameters);
+  ASSERT_EQ(clone->num_parameters(), 4);
+  EXPECT_TRUE(
+      ShapeUtil::Equal(clone->parameter_instruction(0)->shape(), r0f32_));
+  EXPECT_TRUE(
+      ShapeUtil::Equal(clone->parameter_instruction(1)->shape(), r0f32_));
+  EXPECT_TRUE(
+      ShapeUtil::Equal(clone->parameter_instruction(2)->shape(), r0s32));
+  EXPECT_TRUE(
+      ShapeUtil::Equal(clone->parameter_instruction(3)->shape(), r0u32));
+}
+
 TEST_F(HloComputationTest, Stringification) {
   const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10});
   const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10});
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
index c58d00ff5084d060498f2d6fbbfa8e12207b810e..e7ed858e8c5af83d08863d64a0aba162c75ed5cb 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
@@ -35,6 +35,34 @@ limitations under the License.
 
 namespace xla {
 
+// Checks whether instr is or transitively contains an instruction that we
+// shouldn't fold.
+//
+// Specifically, we don't fold kRng or kAfterAll instructions:
+//
+//  - kRng is already marked as side-effecting and so is skipped elsewhere, but
+//    we check for it here.  Even kRng weren't side-effecting and took an
+//    explicit seed, we *still* wouldn't want to constant-fold it, because the
+//    evaluator's handling of rng is not guaranteed to be identical to any
+//    particular backend's rng.
+//
+//  - kAfterAll needs to be skipped because a kAfterAll op with no args can
+//    currently materialize a token "out of thin air".  TODO(b/110532604):
+//    Remove this check once AfterAll requires at least one operand, in which
+//    case constant folding will be impossible.
+static bool IsOrContainsIllegalInstr(const HloInstruction* instr) {
+  if (instr->opcode() == HloOpcode::kAfterAll ||
+      instr->opcode() == HloOpcode::kRng) {
+    return true;
+  }
+  for (const HloComputation* c : instr->called_computations()) {
+    if (absl::c_any_of(c->instructions(), IsOrContainsIllegalInstr)) {
+      return true;
+    }
+  }
+  return false;
+}
+
 StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
   // Limit the constant folding to 0 iterations to skip folding loops. This
   // retains the behavior from before while loop support in HloEvaluator and may
@@ -52,25 +80,24 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
           computation->root_instruction() != instruction) {
         continue;
       }
-      // Skip Constant, Parameter, Tuple, AfterAll operation.
-      // Tuple constants are not directly supported by any backends, hence
-      // folding Tuple is not useful and would in fact be expanded back into
-      // kTuple by Algebraic Simplifier.
-      // TODO(b/110532604): Enable AfterAll once AfterAll requires at least one
-      // operand in which case constant folding will be impossible and this
-      // special case is not necessary.
-      if (instruction->opcode() == HloOpcode::kParameter ||
-          instruction->opcode() == HloOpcode::kConstant ||
-          instruction->opcode() == HloOpcode::kTuple ||
-          instruction->opcode() == HloOpcode::kAfterAll) {
-        continue;
-      }
 
       // Skip instructions with non-constant operands.
       if (!hlo_query::AllOperandsAreConstants(*instruction)) {
         continue;
       }
 
+      // Don't fold Constant, Parameter, and Tuple instructions.  Tuple
+      // constants are not directly supported by any backends, hence folding
+      // Tuple is not useful and would in fact be expanded back into kTuple by
+      // Algebraic Simplifier.
+      //
+      // (We do allow folding subcomputations that contain these instructions.)
+      if (instruction->opcode() == HloOpcode::kParameter ||
+          instruction->opcode() == HloOpcode::kConstant ||
+          instruction->opcode() == HloOpcode::kTuple) {
+        continue;
+      }
+
       // Broadcasts dramatically increase the size of constants, which is often
       // detrimental to performance and memory capacity, so do not fold
       // broadcasts.
@@ -79,6 +106,18 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
         continue;
       }
 
+      // Check for instructions that we can't fold even if they appear inside of
+      // a subcomputation (e.g. a kCall).
+      if (IsOrContainsIllegalInstr(instruction)) {
+        continue;
+      }
+
+      // Don't constant-fold side-effecting instructions or instructions which
+      // contain side-effecting instructions.
+      if (instruction->HasSideEffect()) {
+        continue;
+      }
+
       // Don't constant fold unless it's a net positive or the output is small.
       if (instruction->shape().IsArray()) {
         int64 elements_in_removed_operands = 0;
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
index 92b748d813c3efef83ef0155f1d5d3c637ce2c57..4bdc980c9ac4fb79cde0242f407aea7057474b27 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
@@ -268,5 +268,51 @@ TEST_F(HloConstantFoldingTest, DoesNotFoldLargePad) {
               GmockMatch(m::Pad(m::Constant(), m::Constant())));
 }
 
+TEST_F(HloConstantFoldingTest, DontFoldSubcomputationContainingAfterAll) {
+  const char* const kModuleStr = R"(
+  HloModule test
+
+  Fn {
+    tok = token[] after-all()
+    ROOT root = f32[10] iota(), iota_dimension=0
+  }
+
+  ENTRY entry {
+    ROOT call = f32[10] call(), to_apply=Fn
+  })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  HloConstantFolding constant_folding;
+  TF_ASSERT_OK_AND_ASSIGN(bool result,
+                          RunHloPass(&constant_folding, module.get()));
+  EXPECT_FALSE(result);
+}
+
+TEST_F(HloConstantFoldingTest,
+       DontFoldSubcomputationTransitivelyContainingRng) {
+  const char* const kModuleStr = R"(
+  HloModule test
+
+  InnerFn {
+    c0 = f32[] constant(0)
+    c1 = f32[] constant(1)
+    ROOT rng = f32[10] rng(c0, c1), distribution=rng_uniform
+  }
+
+  Fn {
+    ROOT fusion = f32[10] fusion(), kind=kLoop, calls=InnerFn
+  }
+
+  ENTRY entry {
+    ROOT call = f32[10] call(), to_apply=Fn
+  })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  HloConstantFolding constant_folding;
+  TF_ASSERT_OK_AND_ASSIGN(bool result,
+                          RunHloPass(&constant_folding, module.get()));
+  EXPECT_FALSE(result);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
index 1678fba1728e161b1f448079f366e8a68db03414..bb5d21c654c73da257d53e4f8486b2e83019b534 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -105,12 +106,26 @@ StatusOr<HloInstruction*> MakeDynamicSliceHlo(
     absl::Span<const int64> slice_sizes) {
   HloComputation* computation = operand->parent();
   CHECK_EQ(computation, start_indices->parent());
+  int64 rank = start_indices->shape().dimensions(0);
+  std::vector<HloInstruction*> scalar_start_indices;
+  for (int i = 0; i < rank; ++i) {
+    // TODO(b/118437727): Update callers to provide scalars directly.
+    auto slice = computation->AddInstruction(HloInstruction::CreateSlice(
+        ShapeUtil::MakeShape(start_indices->shape().element_type(), {1}),
+        start_indices, {i}, {i + 1}, {1}));
+    scalar_start_indices.push_back(
+        computation->AddInstruction(HloInstruction::CreateReshape(
+            ShapeUtil::MakeShape(start_indices->shape().element_type(), {}),
+            slice)));
+  }
+  std::vector<Shape> scalar_start_indices_shapes(
+      rank, ShapeUtil::MakeShape(start_indices->shape().element_type(), {}));
   TF_ASSIGN_OR_RETURN(
       Shape dynamic_slice_shape,
       ShapeInference::InferDynamicSliceShape(
-          operand->shape(), start_indices->shape(), slice_sizes));
+          operand->shape(), scalar_start_indices_shapes, slice_sizes));
   return computation->AddInstruction(HloInstruction::CreateDynamicSlice(
-      dynamic_slice_shape, operand, start_indices, slice_sizes));
+      dynamic_slice_shape, operand, scalar_start_indices, slice_sizes));
 }
 
 StatusOr<HloInstruction*> MakeDynamicUpdateSliceHlo(
@@ -119,12 +134,26 @@ StatusOr<HloInstruction*> MakeDynamicUpdateSliceHlo(
   HloComputation* computation = operand->parent();
   CHECK_EQ(computation, update->parent());
   CHECK_EQ(computation, start_indices->parent());
+  int64 rank = start_indices->shape().dimensions(0);
+  std::vector<HloInstruction*> scalar_start_indices;
+  for (int i = 0; i < rank; ++i) {
+    // TODO(b/118437727): Update callers to provide scalars directly.
+    auto slice = computation->AddInstruction(HloInstruction::CreateSlice(
+        ShapeUtil::MakeShape(start_indices->shape().element_type(), {1}),
+        start_indices, {i}, {i + 1}, {1}));
+    scalar_start_indices.push_back(
+        computation->AddInstruction(HloInstruction::CreateReshape(
+            ShapeUtil::MakeShape(start_indices->shape().element_type(), {}),
+            slice)));
+  }
+  std::vector<Shape> scalar_start_indices_shapes(
+      rank, ShapeUtil::MakeShape(start_indices->shape().element_type(), {}));
   TF_ASSIGN_OR_RETURN(
       Shape dynamic_update_slice_shape,
       ShapeInference::InferDynamicUpdateSliceShape(
-          operand->shape(), update->shape(), start_indices->shape()));
+          operand->shape(), update->shape(), scalar_start_indices_shapes));
   return computation->AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-      dynamic_update_slice_shape, operand, update, start_indices));
+      dynamic_update_slice_shape, operand, update, scalar_start_indices));
 }
 
 StatusOr<HloInstruction*> MakeBroadcastHlo(
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
index aaa9ec60eb3c4e0159ed40b37d772e0973d306ec..3715e12b4e2baf7bc2149237457c16c3919c5083 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
@@ -56,9 +56,9 @@ TEST_F(HloCreationUtilsTest, CollapseFirst1Dim) {
   entry_computation->set_root_instruction(first_1_dims_collapsed);
 
   HloEvaluator evaluator;
-  TF_ASSERT_OK_AND_ASSIGN(Literal result_literal,
-                          evaluator.Evaluate<Literal>(
-                              *module, {LiteralUtil::CreateR1<int32>({3, 4})}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      Literal result_literal,
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR1<int32>({3, 4})}));
   CHECK_EQ(result_literal, LiteralUtil::CreateR1<int32>({3, 4}));
 }
 
@@ -77,10 +77,9 @@ TEST_F(HloCreationUtilsTest, CollapseFirst2Dims) {
   HloEvaluator evaluator;
   TF_ASSERT_OK_AND_ASSIGN(
       Literal result_literal,
-      evaluator.Evaluate<Literal>(
-          *module,
-          {LiteralUtil::CreateR3<int32>(
-              {{{1, 2}, {3, 4}, {5, 6}}, {{-1, -2}, {-3, -4}, {-5, -6}}})}));
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR3<int32>(
+                                      {{{1, 2}, {3, 4}, {5, 6}},
+                                       {{-1, -2}, {-3, -4}, {-5, -6}}})}));
   CHECK_EQ(result_literal,
            LiteralUtil::CreateR2<int32>(
                {{1, 2}, {3, 4}, {5, 6}, {-1, -2}, {-3, -4}, {-5, -6}}));
@@ -101,8 +100,7 @@ TEST_F(HloCreationUtilsTest, Prepend1DegenerateDim) {
   HloEvaluator evaluator;
   TF_ASSERT_OK_AND_ASSIGN(
       Literal result_literal,
-      evaluator.Evaluate<Literal>(*module,
-                                  {LiteralUtil::CreateR1<int32>({9, 10})}));
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR1<int32>({9, 10})}));
   CHECK_EQ(result_literal, LiteralUtil::CreateR2<int32>({{9, 10}}));
 }
 
@@ -121,8 +119,7 @@ TEST_F(HloCreationUtilsTest, Prepend2DegenerateDims) {
   HloEvaluator evaluator;
   TF_ASSERT_OK_AND_ASSIGN(
       Literal result_literal,
-      evaluator.Evaluate<Literal>(*module,
-                                  {LiteralUtil::CreateR1<int32>({9, 10})}));
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR1<int32>({9, 10})}));
   CHECK_EQ(result_literal, LiteralUtil::CreateR3<int32>({{{9, 10}}}));
 }
 
@@ -141,7 +138,7 @@ TEST_F(HloCreationUtilsTest, Prepend2DegenerateDimsToScalar) {
   HloEvaluator evaluator;
   TF_ASSERT_OK_AND_ASSIGN(
       Literal result_literal,
-      evaluator.Evaluate<Literal>(*module, {LiteralUtil::CreateR0<int32>(9)}));
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR0<int32>(9)}));
   CHECK_EQ(result_literal, LiteralUtil::CreateR2<int32>({{9}}));
 }
 
@@ -160,8 +157,8 @@ TEST_F(HloCreationUtilsTest, ExpandFirstDimInto3Dims) {
   HloEvaluator evaluator;
   TF_ASSERT_OK_AND_ASSIGN(
       Literal result_literal,
-      evaluator.Evaluate<Literal>(
-          *module, {LiteralUtil::CreateR1<int32>({1, 2, 3, 4, 5, 6})}));
+      evaluator.Evaluate(*module,
+                         {LiteralUtil::CreateR1<int32>({1, 2, 3, 4, 5, 6})}));
   CHECK_EQ(result_literal,
            LiteralUtil::CreateR3<int32>({{{1, 2}}, {{3, 4}}, {{5, 6}}}));
 }
@@ -180,9 +177,9 @@ TEST_F(HloCreationUtilsTest, PadVectorWithZeros) {
   entry_computation->set_root_instruction(zero_padded_param);
 
   HloEvaluator evaluator;
-  TF_ASSERT_OK_AND_ASSIGN(Literal result_literal,
-                          evaluator.Evaluate<Literal>(
-                              *module, {LiteralUtil::CreateR1<int32>({3, 4})}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      Literal result_literal,
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR1<int32>({3, 4})}));
   CHECK_EQ(result_literal, LiteralUtil::CreateR1<int32>({0, 0, 0, 3, 4, 0}));
 }
 
@@ -202,7 +199,7 @@ TEST_F(HloCreationUtilsTest, BroadcastZeros_S32) {
   HloEvaluator evaluator;
   TF_ASSERT_OK_AND_ASSIGN(
       Literal result_literal,
-      evaluator.Evaluate<Literal>(*module, {LiteralUtil::CreateR0<int32>(0)}));
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR0<int32>(0)}));
   CHECK_EQ(result_literal, LiteralUtil::CreateR2<int32>({{0, 0}, {0, 0}}));
 }
 
@@ -220,9 +217,9 @@ TEST_F(HloCreationUtilsTest, BroadcastZeros_F32) {
   entry_computation->set_root_instruction(zeros);
 
   HloEvaluator evaluator;
-  TF_ASSERT_OK_AND_ASSIGN(Literal result_literal,
-                          evaluator.Evaluate<Literal>(
-                              *module, {LiteralUtil::CreateR0<float>(0.0f)}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      Literal result_literal,
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR0<float>(0.0f)}));
   CHECK_EQ(result_literal,
            LiteralUtil::CreateR2<float>({{0.0f, 0.0f}, {0.0f, 0.0f}}));
 }
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index 1924204df044faa6d55f11c1180e2ecc1f0e9e64..3144a84805454488f417391f40ed6b9e9facc752 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -107,7 +107,7 @@ bool HloDataflowAnalysis::AreTransitiveUsesElementwiseOrTuple(
           return false;
         }
       }
-      if (!visited.count(user)) {
+      if (!visited.contains(user)) {
         stack.push_back(user);
       }
     }
@@ -256,7 +256,7 @@ bool HloDataflowAnalysis::Phi(
         input_value_ids.push_back(value->id());
       }
     }
-    std::sort(input_value_ids.begin(), input_value_ids.end());
+    absl::c_sort(input_value_ids);
     input_value_ids.erase(
         std::unique(input_value_ids.begin(), input_value_ids.end()),
         input_value_ids.end());
@@ -271,8 +271,7 @@ bool HloDataflowAnalysis::Phi(
     if (current_value_defined_here) {
       VLOG(5) << "current_value_defined_here: " << current_value->ToString();
       CHECK(current_value->is_phi());
-      auto it = std::find(input_value_ids.begin(), input_value_ids.end(),
-                          current_value->id());
+      auto it = absl::c_find(input_value_ids, current_value->id());
       if (it != input_value_ids.end()) {
         input_value_ids.erase(it);
       }
@@ -921,8 +920,7 @@ StatusOr<std::unique_ptr<HloDataflowAnalysis>> HloDataflowAnalysis::Run(
   for (auto& pair : dataflow_analysis->values_) {
     dataflow_analysis->values_vector_.push_back(&pair.second);
   }
-  std::sort(dataflow_analysis->values_vector_.begin(),
-            dataflow_analysis->values_vector_.end(), HloValue::IdLessThan);
+  absl::c_sort(dataflow_analysis->values_vector_, HloValue::IdLessThan);
 
   TF_DCHECK_OK(dataflow_analysis->Verify());
 
@@ -937,9 +935,7 @@ Status HloDataflowAnalysis::Verify() const {
   for (const HloValue* value : values()) {
     for (const HloPosition& position : value->positions()) {
       const HloValueSet& value_set = GetValueSet(position);
-      TF_RET_CHECK(std::find(value_set.values().begin(),
-                             value_set.values().end(),
-                             value) != value_set.values().end())
+      TF_RET_CHECK(absl::c_linear_search(value_set.values(), value))
           << "Value set at position " << position << " does not contain value "
           << value->ToShortString();
     }
@@ -954,9 +950,7 @@ Status HloDataflowAnalysis::Verify() const {
         const HloValueSet& value_set = pair.second;
         const HloPosition position{instruction, index};
         for (const HloValue* value : value_set.values()) {
-          TF_RET_CHECK(std::find(value->positions().begin(),
-                                 value->positions().end(),
-                                 position) != value->positions().end())
+          TF_RET_CHECK(absl::c_linear_search(value->positions(), position))
               << "Value set at position " << position
               << " unexpectedly contains value " << value->ToShortString();
         }
@@ -1041,11 +1035,10 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
       // Check if one operand of kAdd fused root is kDot or kConvolution.
       auto* add = user->fused_expression_root();
       auto add_operand_it =
-          std::find_if(add->operands().begin(), add->operands().end(),
-                       [&](HloInstruction* operand) {
-                         return operand->opcode() == HloOpcode::kConvolution ||
-                                operand->opcode() == HloOpcode::kDot;
-                       });
+          absl::c_find_if(add->operands(), [&](HloInstruction* operand) {
+            return operand->opcode() == HloOpcode::kConvolution ||
+                   operand->opcode() == HloOpcode::kDot;
+          });
       if (add_operand_it == add->operands().end()) {
         return false;
       }
@@ -1100,16 +1093,15 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
     // *) The root instruction of the called computation is element-wise on
     //    'operand'.
     const bool found_caller_use =
-        std::find_if(uses.begin(), uses.end(), [user](const HloUse& use) {
+        absl::c_find_if(uses, [user](const HloUse& use) {
           return use.instruction == user;
         }) != uses.end();
     auto* callee_root = user->to_apply()->root_instruction();
     const bool found_elementwise_callee_use =
-        std::find_if(
-            uses.begin(), uses.end(), [callee_root](const HloUse& use) {
-              return use.instruction == callee_root &&
-                     callee_root->IsElementwiseOnOperand(use.operand_number);
-            }) != uses.end();
+        absl::c_find_if(uses, [callee_root](const HloUse& use) {
+          return use.instruction == callee_root &&
+                 callee_root->IsElementwiseOnOperand(use.operand_number);
+        }) != uses.end();
     return uses.size() == 2 && found_caller_use && found_elementwise_callee_use;
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index 1d165ac35f766af857dc0984d3f7012ad945b3c2..888886865b9cd7d09af6d3b5f016b60ccef5facd 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -1970,12 +1970,13 @@ TEST_F(DoesNotUseOperandBufferTest, FusedDynamicUpdateSlice) {
 
   // Create a DynamicUpdateSlice instruction of tuple element 1.
   auto starts = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
   auto update = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
   auto dynamic_update_slice =
       builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-          data_shape, gte1, update, starts));
+          data_shape, gte1, update,
+          std::initializer_list<HloInstruction*>({starts})));
   builder.AddInstruction(
       HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
 
@@ -2012,12 +2013,13 @@ TEST_F(DoesNotUseOperandBufferTest, IndirectUses) {
 
   // Create a DynamicUpdateSlice instruction of tuple element 1.
   auto starts = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
   auto update = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
   auto dynamic_update_slice =
       builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-          data_shape, gte1, update, starts));
+          data_shape, gte1, update,
+          std::initializer_list<HloInstruction*>({starts})));
   builder.AddInstruction(
       HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
 
@@ -2150,17 +2152,17 @@ TEST_F(CanShareOperandBufferWithUserTest,
 
   auto param = builder.AddInstruction(
       HloInstruction::CreateParameter(0, data_shape, "param0"));
-  auto index = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int64>({0, 0})));
-  auto ds = builder.AddInstruction(
-      HloInstruction::CreateDynamicSlice(slice_shape, param, index, {1, 2, 2}));
+  auto zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int64>(0)));
+  auto ds = builder.AddInstruction(HloInstruction::CreateDynamicSlice(
+      slice_shape, param, {zero, zero}, {1, 2, 2}));
 
-  auto dus = builder.AddInstruction(
-      HloInstruction::CreateDynamicUpdateSlice(data_shape, param, ds, index));
+  auto dus = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+      data_shape, param, ds, {zero, zero}));
 
   BuildModule(builder.Build());
   auto fusion = computation_->CreateFusionInstruction(
-      {dus, ds, index}, HloInstruction::FusionKind::kLoop);
+      {dus, ds, zero}, HloInstruction::FusionKind::kLoop);
   RunAnalysis();
 
   EXPECT_TRUE(
@@ -2219,12 +2221,13 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) {
 
   // Create a DynamicUpdateSlice instruction of tuple element 1.
   auto starts = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
   auto update = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
   auto dynamic_update_slice =
       builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-          data_shape, gte1, update, starts));
+          data_shape, gte1, update,
+          std::initializer_list<HloInstruction*>({starts})));
   builder.AddInstruction(
       HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
 
@@ -2259,12 +2262,13 @@ TEST_F(CanShareOperandBufferWithUserTest,
 
   // Create a DynamicUpdateSlice instruction of tuple element 1.
   auto starts = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
   auto update = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
   auto dynamic_update_slice =
       builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-          data_shape_bf16, convert1, update, starts));
+          data_shape_bf16, convert1, update,
+          std::initializer_list<HloInstruction*>({starts})));
 
   auto convert2 = builder.AddInstruction(
       HloInstruction::CreateConvert(data_shape, dynamic_update_slice));
@@ -2290,10 +2294,13 @@ TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
       HloInstruction::CreateParameter(0, data_shape, "data"));
   auto update = builder.AddInstruction(
       HloInstruction::CreateParameter(1, update_shape, "update"));
-  auto starts = builder.AddInstruction(
-      HloInstruction::CreateParameter(2, starts_shape, "starts"));
+  auto start0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, starts_shape, "start0"));
+  auto start1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(3, starts_shape, "start1"));
+
   auto dus = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-      data_shape, data, update, starts));
+      data_shape, data, update, {start0, start1}));
 
   BuildModuleAndRunAnalysis(builder.Build());
 
@@ -2304,7 +2311,9 @@ TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
   EXPECT_FALSE(
       dataflow_analysis_->CanShareOperandBufferWithUser(update, {}, dus, {}));
   EXPECT_FALSE(
-      dataflow_analysis_->CanShareOperandBufferWithUser(starts, {}, dus, {}));
+      dataflow_analysis_->CanShareOperandBufferWithUser(start0, {}, dus, {}));
+  EXPECT_FALSE(
+      dataflow_analysis_->CanShareOperandBufferWithUser(start1, {}, dus, {}));
 }
 
 TEST_F(CanShareOperandBufferWithUserTest, ScatterCanShare) {
diff --git a/tensorflow/compiler/xla/service/hlo_dce.cc b/tensorflow/compiler/xla/service/hlo_dce.cc
index 7d35e251ca21951036336ff1a1eb4aabc87bc5ca..a5a11f09cf4f857b992e5ede3a9dbc5a937ce722 100644
--- a/tensorflow/compiler/xla/service/hlo_dce.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -65,7 +66,7 @@ StatusOr<bool> HloDCE::Run(HloModule* module) {
 
   // Now DCE HloComputations.  First, collect the computations that are
   // referenced by some remaining instruction.
-  std::unordered_set<HloComputation*> live_computations;
+  absl::flat_hash_set<HloComputation*> live_computations;
   if (HloComputation* entry_computation = module->entry_computation()) {
     live_computations.insert(entry_computation);
   }
@@ -79,7 +80,7 @@ StatusOr<bool> HloDCE::Run(HloModule* module) {
 
   // Remove dead computations.
   for (auto* computation : module->MakeComputationPostOrder()) {
-    if (live_computations.count(computation) == 0) {
+    if (!live_computations.contains(computation)) {
       TF_RETURN_IF_ERROR(module->RemoveEmbeddedComputation(computation));
       changed = true;
     }
diff --git a/tensorflow/compiler/xla/service/hlo_dce_test.cc b/tensorflow/compiler/xla/service/hlo_dce_test.cc
index 1fa4259a3e42286cbc911907eea563e6ca6f8611..b5d72b386f89568cc3066b2e497be98428d1ed0c 100644
--- a/tensorflow/compiler/xla/service/hlo_dce_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce_test.cc
@@ -43,9 +43,7 @@ class HloDceTest : public HloTestBase {
   // Returns whether the given instruction exists in the given computation.
   bool HasInstruction(const HloComputation& computation,
                       const HloInstruction* instruction) {
-    return std::find(computation.instructions().begin(),
-                     computation.instructions().end(),
-                     instruction) != computation.instructions().end();
+    return absl::c_linear_search(computation.instructions(), instruction);
   }
 };
 
diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.cc b/tensorflow/compiler/xla/service/hlo_domain_map.cc
index c6d02f9f67bb599e496d20fc2acf2e627ed54438..7cdb7f6bdf26241cda4fabbb5ccaf6e6f7de39ce 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_map.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_map.cc
@@ -230,10 +230,10 @@ HloDomainMap::MakeNonDomainInstructions(
     }
   }
   // sort instructions according to instructions_order
-  std::sort(instructions.begin(), instructions.end(),
-            [&instructions_order](HloInstruction* a, HloInstruction* b) {
-              return instructions_order.at(a) < instructions_order.at(b);
-            });
+  absl::c_sort(instructions,
+               [&instructions_order](HloInstruction* a, HloInstruction* b) {
+                 return instructions_order.at(a) < instructions_order.at(b);
+               });
   return instructions;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index cdb2df24db8b6bafa4c7e98c635435750cdf42ad..b8d95fb24479b375506cfe9a788540ebefafb8c1 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -138,6 +138,11 @@ StatusOr<Literal> Compare<complex64>(const Shape& shape, HloOpcode opcode,
 
 }  // namespace
 
+// Note that unsupported types by the typed visitor does not necessarily imply
+// the non-typed HloEvaluator (parent evaluator) would not support them either
+// in the type-agnostic handler. For e.g., HandleGetTupleElement in the parent
+// type-agnostic evaluator will be able to accept Tuple primitive type, whereas
+// HloEvaluatorTypedVisitor cannot.
 HloEvaluator::HloEvaluator(int64 max_loop_iterations)
     : max_loop_iterations_(max_loop_iterations) {
   typed_visitors_[PRED] =
@@ -198,99 +203,47 @@ HloEvaluator::HloEvaluator(int64 max_loop_iterations)
       });
 }
 
-template <typename LiteralPtr>
-StatusOr<Literal> HloEvaluator::Evaluate(
-    const HloModule& module, absl::Span<const LiteralPtr> arg_literals) {
-  XLA_VLOG_LINES(2, "HloEvaluator::Evaluate module:\n" + module.ToString());
-
-  evaluated_.clear();
-  arg_literals_.clear();
-  for (const auto& literal_ptr : arg_literals) {
-    arg_literals_.push_back(&*literal_ptr);
-  }
-
-  TF_RETURN_IF_ERROR(module.entry_computation()->Accept(this));
-
-  return GetEvaluatedLiteralFor(module.entry_computation()->root_instruction())
-      .Clone();
-}
-
-template <>
-StatusOr<Literal> HloEvaluator::Evaluate<Literal>(
-    const HloModule& module, absl::Span<const Literal> arg_literals) {
-  std::vector<const Literal*> arg_literal_ptrs;
-  for (const auto& literal_ptr : arg_literals) {
-    arg_literal_ptrs.push_back(&literal_ptr);
-  }
-  return Evaluate<const Literal*>(module, arg_literal_ptrs);
-}
-
-template <typename LiteralPtr>
 StatusOr<Literal> HloEvaluator::Evaluate(
     const HloComputation& computation,
-    absl::Span<const LiteralPtr> arg_literals) {
+    absl::Span<const Literal* const> arg_literals) {
   CHECK(computation.parent() != nullptr);
   XLA_VLOG_LINES(
       2, "HloEvaluator::Evaluate computation:\n" + computation.ToString());
 
-  evaluated_.clear();
-  arg_literals_.clear();
-  for (const auto& literal_ptr : arg_literals) {
-    arg_literals_.push_back(&*literal_ptr);
+  if (arg_literals.size() != computation.num_parameters()) {
+    return InvalidArgument(
+        "Expected %d argument%s, but got %d.", computation.num_parameters(),
+        computation.num_parameters() == 1 ? "" : "s", arg_literals.size());
   }
-
-  TF_RETURN_IF_ERROR(computation.Accept(this));
-  return GetEvaluatedLiteralFor(computation.root_instruction()).Clone();
-}
-
-template <>
-StatusOr<Literal> HloEvaluator::Evaluate<Literal>(
-    const HloComputation& computation, absl::Span<const Literal> arg_literals) {
-  std::vector<const Literal*> arg_literal_ptrs;
-  for (const auto& literal_ptr : arg_literals) {
-    arg_literal_ptrs.push_back(&literal_ptr);
+  for (int64 i = 0; i < arg_literals.size(); ++i) {
+    const auto& computation_shape =
+        computation.parameter_instruction(i)->shape();
+    const auto& arg_shape = arg_literals[i]->shape();
+    if (!ShapeUtil::Equal(computation_shape, arg_shape)) {
+      return InvalidArgument(
+          "Shape mismatch at parameter %d. Computation expected %s, but arg "
+          "was %s.",
+          i, ShapeUtil::HumanStringWithLayout(computation_shape),
+          ShapeUtil::HumanString(arg_shape));
+    }
   }
-  return Evaluate<const Literal*>(computation, arg_literal_ptrs);
-}
-
-template <typename LiteralPtr>
-StatusOr<Literal> HloEvaluator::Evaluate(
-    HloInstruction* instruction, absl::Span<const LiteralPtr> arg_literals) {
-  TF_RET_CHECK(hlo_query::AllOperandsAreParametersOrConstants(*instruction));
 
   evaluated_.clear();
   arg_literals_.clear();
   for (const auto& literal_ptr : arg_literals) {
     arg_literals_.push_back(&*literal_ptr);
   }
-
-  // Evaluate operands of Parameter type against the input literals which
-  // caches the evaluated literal results.
-  for (const auto operand : instruction->operands()) {
-    if (operand->opcode() == HloOpcode::kParameter) {
-      const Literal* input_literal = arg_literals_[operand->parameter_number()];
-      VLOG(2) << "Parameter operand evaluated to: "
-              << input_literal->ToString();
-      TF_RET_CHECK(ShapeUtil::Equal(operand->shape(), input_literal->shape()));
-
-      evaluated_[operand] = input_literal->Clone();
-    }
+  if (computation.parent()->config().seed()) {
+    seed_ = computation.parent()->config().seed();
+  } else {
+    std::random_device rd;
+    seed_ = rd();
   }
 
-  TF_RETURN_IF_ERROR(Preprocess(instruction));
-  TF_RETURN_IF_ERROR(instruction->Visit(this));
-  TF_RETURN_IF_ERROR(Postprocess(instruction));
-  return GetEvaluatedLiteralFor(instruction).Clone();
-}
+  engine_ = std::minstd_rand0(seed_);
 
-template <>
-StatusOr<Literal> HloEvaluator::Evaluate<Literal>(
-    HloInstruction* instruction, absl::Span<const Literal> arg_literals) {
-  std::vector<const Literal*> arg_literal_ptrs;
-  for (const auto& literal : arg_literals) {
-    arg_literal_ptrs.push_back(&literal);
-  }
-  return Evaluate<const Literal*>(instruction, arg_literal_ptrs);
+  TF_RETURN_IF_ERROR(computation.Accept(this));
+  return GetEvaluatedLiteralFor(computation.root_instruction()).Clone();
 }
 
 StatusOr<Literal> HloEvaluator::Evaluate(HloInstruction* instruction) {
@@ -408,6 +361,31 @@ Status HloEvaluator::HandleBitcast(HloInstruction* bitcast) {
   return Status::OK();
 }
 
+Status HloEvaluator::HandleGetDimensionSize(
+    HloInstruction* get_dimension_size) {
+  HloInstruction* operand = get_dimension_size->mutable_operand(0);
+  int64 dim = get_dimension_size->dimension();
+  if (dynamic_dimension_inference_ == nullptr) {
+    return InvalidArgument(
+        "Evaluator cannot evaluate get_dimension_size without "
+        "set_dynamic_dimension_inference.");
+  }
+  HloInstruction* dynamic_size =
+      dynamic_dimension_inference_->GetDynamicSize(operand, {}, dim);
+  if (dynamic_size != nullptr) {
+    evaluated_[get_dimension_size] =
+        GetEvaluatedLiteralFor(dynamic_size).Clone();
+    return Status::OK();
+  }
+
+  const Shape& shape = get_dimension_size->operand(0)->shape();
+  Literal output(ShapeUtil::MakeShape(U32, {}));
+  output.PopulateWithValue(
+      static_cast<uint32>(shape.dimensions(get_dimension_size->dimension())));
+  evaluated_[get_dimension_size] = std::move(output);
+  return Status::OK();
+}
+
 Status HloEvaluator::HandleParameter(HloInstruction* parameter) {
   CHECK_LT(parameter->parameter_number(), arg_literals_.size());
   const Literal* input_literal = arg_literals_[parameter->parameter_number()];
@@ -1127,9 +1105,10 @@ Status HloEvaluator::HandleCall(HloInstruction* call) {
   }
 
   HloEvaluator embedded_evaluator;
-  Literal result =
-      embedded_evaluator.Evaluate<const Literal*>(*computation, arg_literals)
-          .ConsumeValueOrDie();
+  embedded_evaluator.set_dynamic_dimension_inference(
+      dynamic_dimension_inference_);
+  Literal result = embedded_evaluator.Evaluate(*computation, arg_literals)
+                       .ConsumeValueOrDie();
 
   evaluated_[call] = std::move(result);
   return Status::OK();
@@ -1145,7 +1124,9 @@ Status HloEvaluator::HandleFusion(HloInstruction* fusion) {
       fusion->fused_instructions_computation()->Clone(
           /*suffix=*/"clone_with_layout", &context);
   for (auto* instruction : cloned_fused_computation->instructions()) {
-    LayoutUtil::SetToDefaultLayout(instruction->mutable_shape());
+    if (!LayoutUtil::HasLayout(instruction->shape())) {
+      LayoutUtil::SetToDefaultLayout(instruction->mutable_shape());
+    }
   }
   auto readded_computation =
       empty_hlo_module.AddEntryComputation(std::move(cloned_fused_computation));
@@ -1159,9 +1140,10 @@ Status HloEvaluator::HandleFusion(HloInstruction* fusion) {
   }
 
   HloEvaluator embedded_evaluator;
+  embedded_evaluator.set_dynamic_dimension_inference(
+      dynamic_dimension_inference_);
   Literal result =
-      embedded_evaluator
-          .Evaluate<const Literal*>(*readded_computation, arg_literals)
+      embedded_evaluator.Evaluate(*readded_computation, arg_literals)
           .ConsumeValueOrDie();
 
   evaluated_[fusion] = std::move(result);
@@ -1179,16 +1161,16 @@ Status HloEvaluator::HandleConditional(HloInstruction* conditional) {
   auto* false_computation = conditional->false_computation();
 
   HloEvaluator embedded_evaluator;
+  embedded_evaluator.set_dynamic_dimension_inference(
+      dynamic_dimension_inference_);
   Literal result;
   if (pred.Get<bool>({})) {
-    result = embedded_evaluator
-                 .Evaluate<const Literal*>(*true_computation,
-                                           {&true_computation_arg})
-                 .ConsumeValueOrDie();
+    result =
+        embedded_evaluator.Evaluate(*true_computation, {&true_computation_arg})
+            .ConsumeValueOrDie();
   } else {
     result = embedded_evaluator
-                 .Evaluate<const Literal*>(*false_computation,
-                                           {&false_computation_arg})
+                 .Evaluate(*false_computation, {&false_computation_arg})
                  .ConsumeValueOrDie();
   }
 
@@ -1235,18 +1217,21 @@ Status HloEvaluator::HandleWhile(HloInstruction* while_hlo) {
   bool keep_going = true;
   int64 iteration_count = 0;
   HloEvaluator cond_evaluator(max_loop_iterations_);
+  cond_evaluator.set_dynamic_dimension_inference(dynamic_dimension_inference_);
   HloEvaluator loop_body_evaluator(max_loop_iterations_);
+  loop_body_evaluator.set_dynamic_dimension_inference(
+      dynamic_dimension_inference_);
   while (keep_going) {
     if (max_loop_iterations_ >= 0 && iteration_count++ > max_loop_iterations_) {
       return InvalidArgument("Loop %s exceeded loop iteration limit (%d).",
                              while_hlo->name(), max_loop_iterations_);
     }
     TF_ASSIGN_OR_RETURN(auto cond_val,
-                        cond_evaluator.Evaluate<Literal*>(*cond_comp, {&lcv}));
+                        cond_evaluator.Evaluate(*cond_comp, {&lcv}));
     keep_going = cond_val.GetFirstElement<bool>();
     if (keep_going) {
-      TF_ASSIGN_OR_RETURN(auto body_val, loop_body_evaluator.Evaluate<Literal*>(
-                                             *body_comp, {&lcv}));
+      TF_ASSIGN_OR_RETURN(auto body_val,
+                          loop_body_evaluator.Evaluate(*body_comp, {&lcv}));
       VLOG(3) << "Loop iteration result: " << body_val.ToString();
       lcv = std::move(body_val);
       cond_evaluator.ResetVisitStates();
@@ -1297,8 +1282,7 @@ StatusOr<Literal> EvaluateSortInternal(HloInstruction* sort,
         // Extract a slice from the keys and values literals that correspond to
         // exactly the row in dimension 'sort_dim'.
         std::vector<int64> limit_indices(indices.begin(), indices.end());
-        std::for_each(limit_indices.begin(), limit_indices.end(),
-                      [](int64& index) { ++index; });
+        absl::c_for_each(limit_indices, [](int64& index) { ++index; });
         limit_indices[sort_dim] = sort_dim_elements;
         TF_ASSIGN_OR_RETURN(auto keys_to_sort,
                             keys_literal.Slice(indices, limit_indices)
@@ -1455,18 +1439,6 @@ Status HloEvaluator::Postprocess(HloInstruction* hlo) {
   return Status::OK();
 }
 
-// Explicit instantiation of templatized Evaluate* methods.
-//
-template StatusOr<Literal> HloEvaluator::Evaluate<const Literal*>(
-    const HloModule& module, absl::Span<const Literal* const> arg_literals);
-
-template StatusOr<Literal> HloEvaluator::Evaluate<const Literal*>(
-    const HloComputation& computation,
-    absl::Span<const Literal* const> arg_literals);
-
-template StatusOr<Literal> HloEvaluator::Evaluate<const Literal*>(
-    HloInstruction* instruction, absl::Span<const Literal* const> arg_literals);
-
 namespace {
 template <typename T>
 std::unique_ptr<Array2D<T>> MatmulArray2DImpl(
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index 829fc2aba787a057914a038a3b22e19c763517b4..604b861913051574b038bd64a1b9d5ce2e79dbf3 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -43,16 +44,24 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // specified.
   explicit HloEvaluator(int64 max_loop_iterations = -1);
 
-  // Evaluates an HLO module and an array of pointers to literals.
-  // Returns the evaluated result as a literal if successful.
+  // Evaluates an HLO module and an array of pointers to literals.  Returns the
+  // evaluated result as a literal if successful.
+  //
   // Precondition: The indices of arg_literals correspond to the parameter
   // numbers of the HLO parameters in the computation. See comment below for an
   // example.
-  // `LiteralPtr` accepts either Literal or const Literal*
-  // type.
-  template <typename LiteralPtr>
+  //
+  // (Dummy template arg is to reduce the overloading priority of one overload
+  // so that Evaluate(module, {}) resolves unambiguously.)
+  StatusOr<Literal> Evaluate(const HloModule& module,
+                             absl::Span<const Literal* const> arg_literals) {
+    return Evaluate(*module.entry_computation(), arg_literals);
+  }
+  template <typename Dummy = void>
   StatusOr<Literal> Evaluate(const HloModule& module,
-                             absl::Span<const LiteralPtr> arg_literals);
+                             absl::Span<const Literal> arg_literals) {
+    return Evaluate(*module.entry_computation(), arg_literals);
+  }
 
   // Evaluates an HLO computation and an array of pointers to literals.
   // Returns the evaluated result as a literal if successful.
@@ -70,29 +79,24 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // where Parameter0 has parameter_number 0 and Parameter1 has parameter_number
   // 1 in this computation. The input literals array will then have its first
   // literal map to Parameter0 and the second map to Parameter1.
-  // `LiteralPtr` accepts either Literal or const Literal*
-  // type.
-  template <typename LiteralPtr>
+  //
+  // (Dummy template arg is to reduce the overloading priority of one overload
+  // so that Evaluate(module, {}) resolves unambiguously.)
   StatusOr<Literal> Evaluate(const HloComputation& computation,
-                             absl::Span<const LiteralPtr> arg_literals);
-
-  // Evaluates a single HLO instruction and an array of pointers to literals.
-  // Return the evaluated result as literal if successful.
-  // Precondition:
-  // 1. argument literals correspond to the input instruction's parameters in
-  // their post-ordering.
-  // 2. the instruction's operands must be of either Parameter or Constant type.
-  // `LiteralPtr` accepts either Literal or const Literal*
-  // type.
-  template <typename LiteralPtr>
-  StatusOr<Literal> Evaluate(HloInstruction* instruction,
-                             absl::Span<const LiteralPtr> arg_literals);
-
-  // Evaluates a single HLO instruction with constant operands.
-  // Returns the evaluated result as literal if successful.
-  // Precondition:
-  // 1. all operands of the input instruction are constants.
-  // 2. the instruction is not a Parameter operation.
+                             absl::Span<const Literal* const> arg_literals);
+  template <typename Dummy = void>
+  StatusOr<Literal> Evaluate(const HloComputation& computation,
+                             absl::Span<const Literal> arg_literals) {
+    std::vector<const Literal*> arg_literal_ptrs;
+    for (const auto& l : arg_literals) {
+      arg_literal_ptrs.push_back(&l);
+    }
+    return Evaluate(computation, arg_literal_ptrs);
+  }
+
+  // Gets the value of running a single HLO instruction.
+  //
+  // All of the operands to this instruction must be constants.
   StatusOr<Literal> Evaluate(HloInstruction* instruction);
 
   // Same as Evaluate, except returning false on error and accepts an output
@@ -120,6 +124,11 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
                                   const PrecisionConfig& precision_config,
                                   const Literal& lhs, const Literal& rhs);
 
+  void set_dynamic_dimension_inference(
+      DynamicDimensionInference* dynamic_dimension_inference) {
+    dynamic_dimension_inference_ = dynamic_dimension_inference;
+  }
+
   // Enable the fast path for certain operations like dot or convolution.
   void set_use_fast_path(bool value) { use_fast_path_ = value; }
 
@@ -158,6 +167,8 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   //
   Status HandleBitcast(HloInstruction* bitcast) override;
 
+  Status HandleGetDimensionSize(HloInstruction* get_dimension_size) override;
+
   Status HandleParameter(HloInstruction* parameter) override;
 
   Status HandleConstant(HloInstruction* constant) override;
@@ -208,6 +219,29 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
 
   Status HandleReduce(HloInstruction* reduce) override;
 
+  // Unsupported HLOs, note some of them (such as BatchNorm*) are typically
+  // expanded in a semantic-preserving way into other HLOs by adding exanpsion
+  // HLO pass to the HLO optimization pass during compilation, which can then be
+  // handled by the evaluator.
+  Status HandleBatchNormGrad(HloInstruction* batch_norm_grad) override {
+    return Unimplemented("BatchNormGrad HLO is unsupported by the evaluator.");
+  };
+  Status HandleBatchNormInference(
+      HloInstruction* batch_norm_inference) override {
+    return Unimplemented(
+        "BatchNormInference HLO is unsupported by the evaluator.");
+  };
+  Status HandleBatchNormTraining(HloInstruction* batch_norm_training) override {
+    return Unimplemented(
+        "BatchNormTraining HLO is unsupported by the evaluator.");
+  };
+  Status HandleInfeed(HloInstruction* infeed) override {
+    return Unimplemented("Infeed HLO is unsupported by the evaluator.");
+  };
+  Status HandleOutfeed(HloInstruction* outfeed) override {
+    return Unimplemented("Outfeed HLO is unsupported by the evaluator.");
+  };
+
   // Returns the already-evaluated literal result for the instruction.
   // A Constant instruction is considered evaluated and its literal will be
   // returned directly without looking up the cache.
@@ -264,6 +298,15 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // Max loop iterations to execute with no maximum if negative.
   int64 max_loop_iterations_;
 
+  // Module-level seed handle.
+  uint64 seed_;
+  // RNG engine.
+  std::minstd_rand0 engine_;
+
+  // DynamicDimensionInference is used to evaluate GetDimensionSize, which
+  // returns the dynamic dimension size of its operand.
+  DynamicDimensionInference* dynamic_dimension_inference_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(HloEvaluator);
 };
 
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index 7c56a0cc36f76069613dee608941ce83a8d1d654..674df0016afb5ed6379ded74987ae4a8eef2669a 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -51,20 +51,18 @@ namespace {
 
 static std::array<bool, 2> use_bf16_params{true, false};
 
-class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
-                         public HloTestBase {
- protected:
-  HloEvaluatorTest() : HloTestBase(), use_bfloat16_(GetParam()) {
-    evaluator_ = absl::make_unique<HloEvaluator>();
-  }
+// Test fixture for the HloEvaluator.
+//
+// In bf16 mode, all f32 shapes are converted to bf16 before running.
+class HloEvaluatorTest : public HloTestBase {
+ public:
+  HloEvaluatorTest() : use_bfloat16_(false) {}
 
   Literal Evaluate(absl::Span<const Literal* const> arg_literals = {}) {
     if (use_bfloat16_) {
-      // In BF16 mode, we convert all F32 type to BF16 and evaluate the module.
-      auto type_converter = HloElementTypeConverter(F32, BF16);
-      type_converter.Run(m_.get()).ValueOrDie();
+      HloElementTypeConverter(F32, BF16).Run(m_.get()).ValueOrDie();
     }
-    return evaluator_->Evaluate(*m_->entry_computation(), arg_literals)
+    return evaluator_.Evaluate(*m_->entry_computation(), arg_literals)
         .ConsumeValueOrDie();
   }
 
@@ -74,16 +72,12 @@ class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
   Literal EvaluateWithModule(
       HloModule* module, absl::Span<const Literal* const> arg_literals = {}) {
     if (use_bfloat16_) {
-      // In BF16 mode, we convert all F32 type to BF16 and evaluate the module.
-      auto type_converter = HloElementTypeConverter(F32, BF16);
-      type_converter.Run(module).ValueOrDie();
+      HloElementTypeConverter(F32, BF16).Run(m_.get()).ValueOrDie();
     }
-    return evaluator_->Evaluate(*module->entry_computation(), arg_literals)
+    return evaluator_.Evaluate(*module->entry_computation(), arg_literals)
         .ConsumeValueOrDie();
   }
 
-  std::unique_ptr<HloEvaluator> evaluator_;
-
   void TestUnaryOp(HloOpcode opcode, Literal expected, Literal input,
                    float aabs = 0) {
     HloComputation::Builder b(TestName());
@@ -117,16 +111,27 @@ class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
     EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
   }
 
-  bool use_bfloat16_;
+ protected:
+  explicit HloEvaluatorTest(bool use_bfloat16) : use_bfloat16_(use_bfloat16) {}
+  HloEvaluator evaluator_;
+
+  const bool use_bfloat16_;
   std::unique_ptr<HloModule> m_ = CreateNewVerifiedModule();
 };
 
-#define XLA_TYPED_TEST_P(test_case_name, test_name, test_type1) \
-  TEST_P(test_case_name, test_name)
+// Lets you write TEST_Ps that run twice, once with and once without bf16.
+class HloEvaluatorBf16Test : public ::testing::WithParamInterface<bool>,
+                             public HloEvaluatorTest {
+ protected:
+  HloEvaluatorBf16Test() : HloEvaluatorTest(/*use_bfloat16=*/GetParam()) {}
+};
+
+INSTANTIATE_TEST_CASE_P(HloEvaluatorTest_Instantiation, HloEvaluatorBf16Test,
+                        ::testing::ValuesIn(use_bf16_params));
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs clamp
 // with 3 operands.
-TEST_P(HloEvaluatorTest, DoesClamp) {
+TEST_P(HloEvaluatorBf16Test, DoesClamp) {
   auto low = LiteralUtil::CreateR2<float>({{0.f, 2.f}, {2.f, 4.f}});
   auto value = LiteralUtil::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
   auto high = LiteralUtil::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
@@ -147,7 +152,7 @@ TEST_P(HloEvaluatorTest, DoesClamp) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) {
+TEST_P(HloEvaluatorBf16Test, DISABLED_DoesClampSpecialBroadcast) {
   auto low = LiteralUtil::CreateR0<float>(0.f);
   auto value = LiteralUtil::CreateR2<float>({{-1.f, 0.f}, {1.f, 2.f}});
   auto high = LiteralUtil::CreateR0<float>(1.f);
@@ -170,7 +175,7 @@ TEST_P(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) {
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs select
 // with 3 operands.
-TEST_P(HloEvaluatorTest, DoesSelect) {
+TEST_P(HloEvaluatorBf16Test, DoesSelect) {
   auto pred = LiteralUtil::CreateR2<bool>({{true, false}, {false, true}});
   auto on_true = LiteralUtil::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
   auto on_false = LiteralUtil::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
@@ -195,7 +200,7 @@ TEST_P(HloEvaluatorTest, DoesSelect) {
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise addition with 2 operands.
-TEST_P(HloEvaluatorTest, DoesAdd) {
+TEST_F(HloEvaluatorTest, DoesAdd) {
   auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = LiteralUtil::CreateR2<int64>({{3, 4}, {-96, 8}});
@@ -204,7 +209,7 @@ TEST_P(HloEvaluatorTest, DoesAdd) {
 }
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise and with 2 operands.
-TEST_P(HloEvaluatorTest, DoesAnd) {
+TEST_P(HloEvaluatorBf16Test, DoesAnd) {
   auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = LiteralUtil::CreateR2<int64>({{0, 0}, {4, 4}});
@@ -213,7 +218,7 @@ TEST_P(HloEvaluatorTest, DoesAnd) {
 }
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise or with 2 operands.
-TEST_P(HloEvaluatorTest, DoesOr) {
+TEST_F(HloEvaluatorTest, DoesOr) {
   auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = LiteralUtil::CreateR2<int64>({{3, 4}, {-100, 4}});
@@ -222,7 +227,7 @@ TEST_P(HloEvaluatorTest, DoesOr) {
 }
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise or with 2 operands.
-TEST_P(HloEvaluatorTest, DoesXor) {
+TEST_F(HloEvaluatorTest, DoesXor) {
   auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = LiteralUtil::CreateR2<int64>({{3, 4}, {-104, 0}});
@@ -231,7 +236,7 @@ TEST_P(HloEvaluatorTest, DoesXor) {
 }
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise multiply with 2 operands.
-TEST_P(HloEvaluatorTest, DoesMultiply) {
+TEST_F(HloEvaluatorTest, DoesMultiply) {
   auto lhs = LiteralUtil::CreateR2<int32>({{-1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int32>(
       {{std::numeric_limits<int32>::min(), 4}, {4, 4}});
@@ -242,14 +247,14 @@ TEST_P(HloEvaluatorTest, DoesMultiply) {
 }
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise divide with 2 operands.
-TEST_P(HloEvaluatorTest, DoesDivideInt64) {
+TEST_F(HloEvaluatorTest, DoesDivideInt64) {
   auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = LiteralUtil::CreateR2<int64>({{0, 0}, {-25, 1}});
   TestBinaryOp(HloOpcode::kDivide, std::move(expected), std::move(lhs),
                std::move(rhs));
 }
-TEST_P(HloEvaluatorTest, DoesDivideDouble) {
+TEST_P(HloEvaluatorBf16Test, DoesDivideDouble) {
   auto lhs = LiteralUtil::CreateR2<double>({{1.0, 0.0}, {-100.0, 4.0}});
   auto rhs = LiteralUtil::CreateR2<double>({{2.2, 4.0}, {4.0, 4.0}});
   auto expected =
@@ -260,41 +265,41 @@ TEST_P(HloEvaluatorTest, DoesDivideDouble) {
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise abs op with 1 operand.
-TEST_P(HloEvaluatorTest, DoesAbsR2) {
+TEST_F(HloEvaluatorTest, DoesAbsR2) {
   auto operand = LiteralUtil::CreateR2<int64>({{1, -20}, {-100, 4}});
   auto expected = LiteralUtil::CreateR2<int64>({{1, 20}, {100, 4}});
   TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand));
 }
-TEST_P(HloEvaluatorTest, DoesAbsR0) {
+TEST_P(HloEvaluatorBf16Test, DoesAbsR0) {
   auto operand = LiteralUtil::CreateR0<float>(-1.0f);
   auto expected = LiteralUtil::CreateR0<float>(1.0f);
   TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand));
 }
-TEST_P(HloEvaluatorTest, DoesAbsR1WithZeroSize) {
+TEST_P(HloEvaluatorBf16Test, DoesAbsR1WithZeroSize) {
   auto operand = LiteralUtil::CreateR1<float>({});
   auto expected = LiteralUtil::CreateR1<float>({});
   TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand));
 }
-TEST_P(HloEvaluatorTest, DoesNegateR2) {
+TEST_F(HloEvaluatorTest, DoesNegateR2) {
   auto operand = LiteralUtil::CreateR2<int32>(
       {{0, std::numeric_limits<int32>::min()}, {-1, 4}});
   auto expected = LiteralUtil::CreateR2<int32>(
       {{0, std::numeric_limits<int>::min()}, {1, -4}});
   TestUnaryOp(HloOpcode::kNegate, std::move(expected), std::move(operand));
 }
-TEST_P(HloEvaluatorTest, DoesCosR2) {
+TEST_P(HloEvaluatorBf16Test, DoesCosR2) {
   auto operand = LiteralUtil::CreateR2<float>({{0, M_PI}, {-M_PI, 2 * M_PI}});
   auto expected = LiteralUtil::CreateR2<float>({{1, -1}, {-1, 1}});
   TestUnaryOp(HloOpcode::kCos, std::move(expected), std::move(operand),
               use_bfloat16_ ? 0.031250 : 9.5367431640625E-7);
 }
-TEST_P(HloEvaluatorTest, DoesSinR2) {
+TEST_P(HloEvaluatorBf16Test, DoesSinR2) {
   auto operand = LiteralUtil::CreateR2<float>({{0, M_PI}, {-M_PI, 2 * M_PI}});
   auto expected = LiteralUtil::CreateR2<float>({{0, 0}, {0, 0}});
   TestUnaryOp(HloOpcode::kSin, std::move(expected), std::move(operand),
               use_bfloat16_ ? 0.031250 : 9.5367431640625E-7);
 }
-TEST_P(HloEvaluatorTest, DoesNotR2) {
+TEST_F(HloEvaluatorTest, DoesNotR2) {
   auto operand =
       LiteralUtil::CreateR2<int32>({{0, std::numeric_limits<int>::min()},
                                     {-1, std::numeric_limits<int>::max()}});
@@ -305,7 +310,7 @@ TEST_P(HloEvaluatorTest, DoesNotR2) {
 }
 // Verifies that HloEvaluator evaluates a HLO Computation with non-parameter nor
 // constant operands.
-TEST_P(HloEvaluatorTest, DoesTraverseInstructions) {
+TEST_F(HloEvaluatorTest, DoesTraverseInstructions) {
   auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
   auto rhs2 = LiteralUtil::CreateR2<int64>({{1, -20}, {-100, 4}});
@@ -335,7 +340,7 @@ TEST_P(HloEvaluatorTest, DoesTraverseInstructions) {
 }
 
 // Verifies Reshape operation is correctly evaluated.
-TEST_P(HloEvaluatorTest, DoesReshape) {
+TEST_F(HloEvaluatorTest, DoesReshape) {
   HloComputation::Builder b(TestName());
   const int64 dimensions[] = {11, 8, 7, 5, 9};
   TF_ASSERT_OK_AND_ASSIGN(auto literal,
@@ -361,7 +366,7 @@ TEST_P(HloEvaluatorTest, DoesReshape) {
 }
 
 // Verifies Broadcast operation is correctly evaluated.
-TEST_P(HloEvaluatorTest, DoesBroadcast) {
+TEST_F(HloEvaluatorTest, DoesBroadcast) {
   HloComputation::Builder b(TestName());
   auto input_literal = LiteralUtil::CreateR2<int32>({{1, 2}, {3, 4}, {5, 6}});
   auto output_literal = LiteralUtil::CreateR3<int32>(
@@ -377,7 +382,7 @@ TEST_P(HloEvaluatorTest, DoesBroadcast) {
   EXPECT_TRUE(LiteralTestUtil::Equal(result, output_literal));
 }
 
-TEST_P(HloEvaluatorTest, DoesBroadcastScalar) {
+TEST_F(HloEvaluatorTest, DoesBroadcastScalar) {
   HloComputation::Builder b(TestName());
   auto input_literal = LiteralUtil::CreateR0<int32>(111);
   auto output_literal = LiteralUtil::CreateR2<int32>(
@@ -396,7 +401,7 @@ TEST_P(HloEvaluatorTest, DoesBroadcastScalar) {
   EXPECT_TRUE(LiteralTestUtil::Equal(result, output_literal));
 }
 
-TEST_P(HloEvaluatorTest, DoesConcatenateSimple) {
+TEST_F(HloEvaluatorTest, DoesConcatenateSimple) {
   HloComputation::Builder b(TestName());
 
   HloInstruction* operand1 = b.AddInstruction(HloInstruction::CreateConstant(
@@ -418,7 +423,7 @@ TEST_P(HloEvaluatorTest, DoesConcatenateSimple) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) {
+TEST_F(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) {
   HloComputation::Builder b(TestName());
 
   HloInstruction* operand1 = b.AddInstruction(
@@ -439,7 +444,7 @@ TEST_P(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, ConvertWithSameLayout) {
+TEST_P(HloEvaluatorBf16Test, ConvertWithSameLayout) {
   HloComputation::Builder b(TestName());
 
   auto input_literal = LiteralUtil::CreateR2<int32>({{1, 2}, {3, 4}, {5, 6}});
@@ -458,7 +463,7 @@ TEST_P(HloEvaluatorTest, ConvertWithSameLayout) {
   EXPECT_TRUE(LiteralTestUtil::Equal(result, expected));
 }
 
-TEST_P(HloEvaluatorTest, ConvertWithDifferentLayout) {
+TEST_P(HloEvaluatorBf16Test, ConvertWithDifferentLayout) {
   HloComputation::Builder b(TestName());
 
   auto input_literal = LiteralUtil::CreateR2WithLayout<int32>(
@@ -491,7 +496,7 @@ PaddingConfig CreatePaddingConfig(
   return padding_config;
 }
 
-TEST_P(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
+TEST_F(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
   auto operand = LiteralUtil::CreateR2<int32>({{}, {}});
   HloComputation::Builder b(TestName());
   auto operand_instruction =
@@ -516,7 +521,7 @@ TEST_P(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) {
+TEST_P(HloEvaluatorBf16Test, Pad4DFloatArrayWithInteriorPadding) {
   HloComputation::Builder b(TestName());
 
   Array4D<float> input_array(3, 2, 1, 1, {1, 2, 3, 4, 5, 6});
@@ -551,7 +556,7 @@ TEST_P(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, NegativePadding2D) {
+TEST_P(HloEvaluatorBf16Test, NegativePadding2D) {
   HloComputation::Builder b(TestName());
 
   // input_array:
@@ -593,7 +598,7 @@ TEST_P(HloEvaluatorTest, NegativePadding2D) {
   EXPECT_TRUE(LiteralTestUtil::Near(expected, result, ErrorSpec(0.031250)));
 }
 
-TEST_P(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
+TEST_P(HloEvaluatorBf16Test, NegativeAndInteriorPadding2D) {
   HloComputation::Builder b(TestName());
 
   // f32[4,3] {
@@ -632,7 +637,7 @@ TEST_P(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DotRank2AndRank1) {
+TEST_P(HloEvaluatorBf16Test, DotRank2AndRank1) {
   HloComputation::Builder b(TestName());
 
   // lhs:
@@ -678,7 +683,7 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank1) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DotRank1AndRank2) {
+TEST_P(HloEvaluatorBf16Test, DotRank1AndRank2) {
   HloComputation::Builder b(TestName());
 
   // lhs:
@@ -716,7 +721,7 @@ TEST_P(HloEvaluatorTest, DotRank1AndRank2) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DotRank2AndRank2) {
+TEST_P(HloEvaluatorBf16Test, DotRank2AndRank2) {
   HloComputation::Builder b(TestName());
 
   // lhs:
@@ -766,7 +771,7 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank2) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DotRank4AndRank4) {
+TEST_P(HloEvaluatorBf16Test, DotRank4AndRank4) {
   HloComputation::Builder b(TestName());
 
   auto lhs_array = absl::make_unique<Array4D<float>>(2, 2, 3, 1);
@@ -810,7 +815,7 @@ TEST_P(HloEvaluatorTest, DotRank4AndRank4) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, SimpleConv1D) {
+TEST_P(HloEvaluatorBf16Test, SimpleConv1D) {
   HloComputation::Builder b(TestName());
 
   Array3D<float> lhs_array = {{{1, 2, 3}}};
@@ -859,7 +864,7 @@ TEST_P(HloEvaluatorTest, SimpleConv1D) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
+TEST_P(HloEvaluatorBf16Test, Simple4x4Conv2DWith2x2Kernel) {
   HloComputation::Builder b(TestName());
 
   Array4D<float> lhs_array(1, 1, 4, 4);
@@ -922,7 +927,7 @@ TEST_P(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) {
+TEST_P(HloEvaluatorBf16Test, Conv2DGeneralDimensionsReversed) {
   HloComputation::Builder b(TestName());
 
   // clang-format off
@@ -1003,7 +1008,7 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, Conv2DGeneralDimensions) {
+TEST_P(HloEvaluatorBf16Test, Conv2DGeneralDimensions) {
   HloComputation::Builder b(TestName());
 
   // clang-format off
@@ -1081,7 +1086,7 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensions) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) {
+TEST_P(HloEvaluatorBf16Test, DilatedBaseConv2DWithHighPadding) {
   HloComputation::Builder b(TestName());
 
   Array4D<float> lhs_array(1, 1, 4, 4);
@@ -1145,7 +1150,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) {
+TEST_P(HloEvaluatorBf16Test, DilatedBaseConv2DWithLowAndHighPadding) {
   HloComputation::Builder b(TestName());
 
   Array4D<float> lhs_array(1, 1, 4, 4);
@@ -1210,7 +1215,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest,
+TEST_P(HloEvaluatorBf16Test,
        DilatedWindowAndBaseConv2DWithDifferentLowAndHighPaddingAndStrides) {
   HloComputation::Builder b(TestName());
 
@@ -1283,7 +1288,7 @@ TEST_P(HloEvaluatorTest,
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, Conv2DGroupedConvolution) {
+TEST_P(HloEvaluatorBf16Test, Conv2DGroupedConvolution) {
   HloComputation::Builder b(TestName());
   std::vector<int64> input_dims = {1, 2, 2, 4};
   std::vector<int64> filter_dims = {2, 2, 2, 8};
@@ -1419,7 +1424,7 @@ void BM_ReducePrecisely(int num_iters) {
 
 BENCHMARK(BM_ReducePrecisely);
 
-TEST_P(HloEvaluatorTest, ReduceAdd) {
+TEST_P(HloEvaluatorBf16Test, ReduceAdd) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1461,7 +1466,7 @@ TEST_P(HloEvaluatorTest, ReduceAdd) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, ReduceWindowMax) {
+TEST_P(HloEvaluatorBf16Test, ReduceWindowMax) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1512,7 +1517,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowMax) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, ReduceWindowMaxWindowDilation) {
+TEST_P(HloEvaluatorBf16Test, ReduceWindowMaxWindowDilation) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1564,7 +1569,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowMaxWindowDilation) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, ReduceWindowAdd) {
+TEST_P(HloEvaluatorBf16Test, ReduceWindowAdd) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1621,7 +1626,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, ReduceWindowAdd6D) {
+TEST_P(HloEvaluatorBf16Test, ReduceWindowAdd6D) {
   HloComputation::Builder b(TestName());
 
   // arg: f32[4,4,4,4,4,4] full of ones. Using small dims to limit run-time.
@@ -1684,7 +1689,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd6D) {
   EXPECT_TRUE(LiteralTestUtil::Equal(result_literal, result));
 }
 
-TEST_P(HloEvaluatorTest, StridedSlice) {
+TEST_P(HloEvaluatorBf16Test, StridedSlice) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1718,7 +1723,7 @@ TEST_P(HloEvaluatorTest, StridedSlice) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DynamicSlice) {
+TEST_P(HloEvaluatorBf16Test, DynamicSlice) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1734,12 +1739,14 @@ TEST_P(HloEvaluatorTest, DynamicSlice) {
   HloInstruction* operand = b.AddInstruction(
       HloInstruction::CreateConstant(std::move(operand_literal)));
 
-  auto start_indices = b.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({0, 1})));
+  auto zero = b.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(0)));
+  auto one = b.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(1)));
 
   Shape shape = ShapeUtil::MakeShape(F32, {2, 3});
-  b.AddInstruction(HloInstruction::CreateDynamicSlice(shape, operand,
-                                                      start_indices, {2, 3}));
+  b.AddInstruction(
+      HloInstruction::CreateDynamicSlice(shape, operand, {zero, one}, {2, 3}));
   m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
@@ -1754,7 +1761,7 @@ TEST_P(HloEvaluatorTest, DynamicSlice) {
 
 // Verifies that the HloEvaluator's implementation goes along with existing
 // backends' behavior, although this is not required by the spec.
-TEST_P(HloEvaluatorTest, DynamicSliceModSlice) {
+TEST_P(HloEvaluatorBf16Test, DynamicSliceModSlice) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1770,12 +1777,14 @@ TEST_P(HloEvaluatorTest, DynamicSliceModSlice) {
   HloInstruction* operand = b.AddInstruction(
       HloInstruction::CreateConstant(std::move(operand_literal)));
 
-  auto start_indices = b.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2, 1})));
+  auto two = b.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
+  auto one = b.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(1)));
 
   Shape shape = ShapeUtil::MakeShape(F32, {2, 3});
-  b.AddInstruction(HloInstruction::CreateDynamicSlice(shape, operand,
-                                                      start_indices, {2, 3}));
+  b.AddInstruction(
+      HloInstruction::CreateDynamicSlice(shape, operand, {two, one}, {2, 3}));
   m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
@@ -1788,7 +1797,7 @@ TEST_P(HloEvaluatorTest, DynamicSliceModSlice) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DynamicSliceUpdate) {
+TEST_P(HloEvaluatorBf16Test, DynamicSliceUpdate) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1804,15 +1813,17 @@ TEST_P(HloEvaluatorTest, DynamicSliceUpdate) {
   HloInstruction* operand = b.AddInstruction(
       HloInstruction::CreateConstant(std::move(operand_literal)));
 
-  auto start_indices = b.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int64>({0, 1})));
+  auto zero = b.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(0)));
+  auto one = b.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(1)));
 
   auto update = b.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<double>({{-2.0, -3.0}, {-6.0, -7.0}})));
 
   Shape shape = ShapeUtil::MakeShape(F64, {2, 3});
   b.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-      shape, operand, update, start_indices));
+      shape, operand, update, {zero, one}));
   m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
@@ -1825,7 +1836,7 @@ TEST_P(HloEvaluatorTest, DynamicSliceUpdate) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, SetAndGetTuples) {
+TEST_P(HloEvaluatorBf16Test, SetAndGetTuples) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1861,7 +1872,7 @@ TEST_P(HloEvaluatorTest, SetAndGetTuples) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, SetAndGetNestedTuples) {
+TEST_P(HloEvaluatorBf16Test, SetAndGetNestedTuples) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1900,7 +1911,7 @@ TEST_P(HloEvaluatorTest, SetAndGetNestedTuples) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, Reverse) {
+TEST_P(HloEvaluatorBf16Test, Reverse) {
   HloComputation::Builder b(TestName());
 
   // Input shape is float[4x3x2x1].
@@ -1953,7 +1964,7 @@ TEST_P(HloEvaluatorTest, Reverse) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateWithSubstitutions) {
+TEST_P(HloEvaluatorBf16Test, EvaluateWithSubstitutions) {
   HloComputation::Builder b(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4});
 
@@ -1977,7 +1988,7 @@ TEST_P(HloEvaluatorTest, EvaluateWithSubstitutions) {
 
 // Check that EvaluateWithSubstitutions works if one of the operands to the op
 // we're evaluating is a constant.
-TEST_P(HloEvaluatorTest, EvaluateWithSubstitutionsWithConstantOperand) {
+TEST_P(HloEvaluatorBf16Test, EvaluateWithSubstitutionsWithConstantOperand) {
   HloComputation::Builder b(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4});
 
@@ -2000,7 +2011,7 @@ TEST_P(HloEvaluatorTest, EvaluateWithSubstitutionsWithConstantOperand) {
       LiteralUtil::CreateR1<float>({11, 22, 33, 44}), result.ValueOrDie()));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV1) {
+TEST_F(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV1) {
   const char* hlo_text = R"(
 HloModule TensorFlowGatherV1
 
@@ -2024,7 +2035,7 @@ ENTRY main {
       Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV2) {
+TEST_F(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV2) {
   const char* hlo_text = R"(
 HloModule TensorFlowGatherV2
 
@@ -2048,7 +2059,7 @@ ENTRY main {
       Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherMultipleBatchDims) {
+TEST_F(HloEvaluatorTest, EvaluateGather_TensorFlowGatherMultipleBatchDims) {
   const char* hlo_text = R"(
 HloModule TensorFlowGatherMultipleBatchDims
 
@@ -2073,7 +2084,7 @@ ENTRY main {
       Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherNd) {
+TEST_F(HloEvaluatorTest, EvaluateGather_TensorFlowGatherNd) {
   const char* hlo_text = R"(
 HloModule TensorFlowGatherNd
 
@@ -2099,7 +2110,7 @@ ENTRY main {
                              Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest,
+TEST_F(HloEvaluatorTest,
        EvaluateGather_TensorFlowGatherNdNonDefaultIndexVectorDim) {
   const char* hlo_text = R"(
 HloModule TensorFlowGatherNd
@@ -2126,7 +2137,7 @@ ENTRY main {
                              Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_DynamicSlice) {
+TEST_F(HloEvaluatorTest, EvaluateGather_DynamicSlice) {
   const char* hlo_text = R"(
 HloModule DynamicSlice
 
@@ -2149,7 +2160,7 @@ ENTRY main {
                                      Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_BatchDynamicSlice) {
+TEST_F(HloEvaluatorTest, EvaluateGather_BatchDynamicSlice) {
   const char* hlo_text = R"(
 HloModule BatchDynamicSlice
 
@@ -2173,7 +2184,7 @@ ENTRY main {
                              Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_ZeroDimBounds) {
+TEST_F(HloEvaluatorTest, EvaluateGather_ZeroDimBounds) {
   const char* hlo_text = R"(
 HloModule TensorFlowGatherV1
 
@@ -2195,7 +2206,7 @@ ENTRY main {
                                      Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_NoOutputWindowDims) {
+TEST_F(HloEvaluatorTest, EvaluateGather_NoOutputWindowDims) {
   const string hlo_text = R"(
 HloModule GatherXd
 
@@ -2220,7 +2231,7 @@ ENTRY main {
                              Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV1_Update) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV1_Update) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatterV1
 
@@ -2251,7 +2262,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV2_Update) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV2_Update) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatterV2
 
@@ -2283,7 +2294,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_Add) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_Add) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatter
 
@@ -2315,7 +2326,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_Mul) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_Mul) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatter
 
@@ -2347,7 +2358,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_F32) {
+TEST_P(HloEvaluatorBf16Test, EvaluateScatter_TensorFlowScatter_F32) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatter
 
@@ -2381,7 +2392,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates}), ErrorSpec{0.1, 0.01}));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_RepeatedIndices) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_RepeatedIndices) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatter
 
@@ -2413,7 +2424,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_MultipleBatchDims) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_MultipleBatchDims) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatterMultipleBatchDims
 
@@ -2446,7 +2457,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterNd) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterNd) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatterNd
 
@@ -2482,7 +2493,7 @@ ENTRY main {
       expected, Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest,
+TEST_F(HloEvaluatorTest,
        EvaluateScatter_TensorFlowScatterNd_NonDefaultIndexVectorDim) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatterNdNonDefaultIndexVectorDim
@@ -2519,7 +2530,7 @@ ENTRY main {
       expected, Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_DynamicUpdateSlice) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_DynamicUpdateSlice) {
   const char* hlo_text = R"(
 HloModule DynamicUpdateSlice
 
@@ -2551,7 +2562,7 @@ ENTRY main {
       expected, Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_BatchDynamicUpdateSlice) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_BatchDynamicUpdateSlice) {
   const char* hlo_text = R"(
 HloModule BatchDynamicUpdateSlice
 
@@ -2583,7 +2594,7 @@ ENTRY main {
       expected, Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_ZeroDimBounds) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_ZeroDimBounds) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatter_ZeroDimBounds
 
@@ -2612,7 +2623,7 @@ ENTRY main {
       operand, Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_NoUpdateWindowDims) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_NoUpdateWindowDims) {
   const string hlo_text = R"(
 HloModule Scatter_NoUpdateWindowDims
 
@@ -2645,7 +2656,7 @@ ENTRY main {
       expected, Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_NegativeIndices) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_NegativeIndices) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatter_NegativeIndices
 
@@ -2680,7 +2691,7 @@ ENTRY main {
                          {&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_OobIndices) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_OobIndices) {
   const string hlo_text = R"(
 HloModule BatchDynamicUpdateSlice
 
@@ -2716,7 +2727,7 @@ ENTRY main {
                          {&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_OobUpdateWindow) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_OobUpdateWindow) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatterNd_OobUpdateWindow
 
@@ -2755,7 +2766,7 @@ ENTRY main {
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise comparison with 2 bfloat16 operands.
-TEST_P(HloEvaluatorTest, DoesCompareBF16) {
+TEST_F(HloEvaluatorTest, DoesCompareBF16) {
   // lhs >= rhs
   auto lhs = LiteralUtil::CreateR2<bfloat16>(
       {{bfloat16(0.25), bfloat16(0.35), bfloat16(0.125)},
@@ -2769,7 +2780,7 @@ TEST_P(HloEvaluatorTest, DoesCompareBF16) {
                std::move(rhs));
 }
 
-TEST_P(HloEvaluatorTest, Bf16Reduction) {
+TEST_P(HloEvaluatorBf16Test, Bf16Reduction) {
   const string hlo_text = R"(
 HloModule Bf16Reduction
 
@@ -2793,7 +2804,7 @@ ENTRY main {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, Evaluate({&arg})));
 }
 
-TEST_P(HloEvaluatorTest, SliceWithDifferentLayout) {
+TEST_P(HloEvaluatorBf16Test, SliceWithDifferentLayout) {
   // Regression test for b/114735354.
   const string hlo_text = R"(
 HloModule SliceWithDifferentLayout
@@ -2812,7 +2823,7 @@ ENTRY main {
   EXPECT_TRUE(LiteralTestUtil::Equal(arg, actual));
 }
 
-TEST_P(HloEvaluatorTest, Bitcast) {
+TEST_P(HloEvaluatorBf16Test, Bitcast) {
   // Regression test for b/114735354.
   constexpr absl::string_view hlo_text_base = R"(
 HloModule Bitcast
@@ -2840,12 +2851,7 @@ ENTRY main {
 }
 
 // Check that s32 under/overflow doesn't trigger a ubsan failure.
-TEST_P(HloEvaluatorTest, Int32Overflow) {
-  // Test not applicable to bf16; only applies to signed integral types.
-  if (use_bfloat16_) {
-    return;
-  }
-
+TEST_F(HloEvaluatorTest, Int32Overflow) {
   constexpr absl::string_view hlo_text = R"(
 HloModule Test
 
@@ -2873,8 +2879,154 @@ ENTRY main {
             static_cast<int32>(pow31 * pow31));
 }
 
-INSTANTIATE_TEST_CASE_P(HloEvaluatorTest_Instantiation, HloEvaluatorTest,
-                        ::testing::ValuesIn(use_bf16_params));
+TEST_F(HloEvaluatorTest, GetDimensionSize) {
+  constexpr absl::string_view hlo_text = R"(
+HloModule Test
+
+ENTRY main {
+  size = u32[] parameter(0)
+
+  data = s32[4] parameter(1)
+
+  sum = s32[4] add(data, data)
+
+  ROOT dynamic_size = u32[] get-dimension-size(sum), dimensions={0}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(m_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{0, {}},
+      DynamicParameterBinding::DynamicDimension{1, {}, 0}));
+
+  TF_ASSERT_OK_AND_ASSIGN(DynamicDimensionInference dynamic_dimension_inference,
+                          DynamicDimensionInference::Run(m_.get()));
+
+  evaluator_.set_dynamic_dimension_inference(&dynamic_dimension_inference);
+  Literal size_arg = LiteralUtil::CreateR0<uint32>(3);
+  Literal data_arg = LiteralUtil::CreateR1<int32>({1, 2, 3, 4});
+
+  Literal actual = Evaluate({&size_arg, &data_arg});
+
+  EXPECT_EQ(actual.GetFirstElement<uint32>(), static_cast<uint32>(3));
+}
+
+// Check that we get a useful error if we pass inputs of the wrong shape.
+TEST_F(HloEvaluatorTest, EvaluateWithWrongInputShapes) {
+  constexpr absl::string_view hlo_text = R"(
+HloModule Test
+
+ENTRY main {
+  p0 = s32[1] parameter(0)
+  ROOT sum = s32[1] add(p0, p0)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  Literal input_wrong_shape = LiteralUtil::CreateR1<int32>({0, 1});
+
+  EXPECT_EQ(HloEvaluator()
+                .Evaluate(*m_, {&input_wrong_shape})
+                .status()
+                .error_message(),
+            "Shape mismatch at parameter 0. Computation expected s32[1]{0}, "
+            "but arg was s32[2].");
+  EXPECT_EQ(HloEvaluator()
+                .Evaluate(*m_->entry_computation(), {&input_wrong_shape})
+                .status()
+                .error_message(),
+            "Shape mismatch at parameter 0. Computation expected s32[1]{0}, "
+            "but arg was s32[2].");
+}
+
+// Check that we get a useful error if we pass too many or too few inputs.
+TEST_F(HloEvaluatorTest, EvaluateWithWrongNumberOfInputs) {
+  constexpr absl::string_view hlo_text = R"(
+HloModule Test
+
+ENTRY main {
+  p0 = s32[1] parameter(0)
+  ROOT sum = s32[1] add(p0, p0)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  Literal input = LiteralUtil::CreateR1<int32>({0});
+
+  EXPECT_EQ(
+      HloEvaluator().Evaluate(*m_, {&input, &input}).status().error_message(),
+      "Expected 1 argument, but got 2.");
+  EXPECT_EQ(HloEvaluator()
+                .Evaluate(*m_->entry_computation(), {&input, &input})
+                .status()
+                .error_message(),
+            "Expected 1 argument, but got 2.");
+}
+
+TEST_F(HloEvaluatorTest, PreserveFusionInputLayout) {
+  constexpr absl::string_view hlo_text = R"(
+    HloModule FusionInputLayout
+
+    fused_computation {
+      param_0 = f32[20,20]{0,1} parameter(0)
+      ROOT bitcast = f32[20,20]{1,0} bitcast(param_0)
+    }
+
+    ENTRY kernel_entry {
+      parameter.0 = f32[20,20]{0,1} parameter(0)
+      ROOT fusion = f32[20,20]{1,0} fusion(parameter.0),
+        kind=kLoop, calls=fused_computation
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  Literal actual = Evaluate({&args[0]});
+  EXPECT_TRUE(absl::c_equal(args[0].data<float>(), actual.data<float>()));
+}
+
+TEST_F(HloEvaluatorTest, PreserveFusionOutputLayout) {
+  constexpr absl::string_view hlo_text = R"(
+    HloModule FusionOutputLayout
+
+    fused_computation {
+      param_0 = f32[20,20]{1,0} parameter(0)
+      ROOT bitcast = f32[20,20]{0,1} bitcast(param_0)
+    }
+
+    ENTRY kernel_entry {
+      parameter.0 = f32[20,20]{1,0} parameter(0)
+      ROOT fusion = f32[20,20]{0,1} fusion(parameter.0),
+        kind=kLoop, calls=fused_computation
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  Literal actual = Evaluate({&args[0]});
+  EXPECT_TRUE(absl::c_equal(args[0].data<float>(), actual.data<float>()));
+}
+
+TEST_F(HloEvaluatorTest, PreserveMOFusionOutputLayout) {
+  constexpr absl::string_view hlo_text = R"(
+    HloModule MOFusionOutputLayout
+
+    fused_computation {
+      param_0 = f32[20,20]{1,0} parameter(0)
+      bitcast = f32[20,20]{0,1} bitcast(param_0)
+      ROOT tuple = (f32[20,20]{0,1}) tuple(bitcast)
+    }
+
+    ENTRY kernel_entry {
+      parameter.0 = f32[20,20]{1,0} parameter(0)
+      ROOT fusion = (f32[20,20]{0,1}) fusion(parameter.0),
+        kind=kLoop, calls=fused_computation
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  Literal actual_tuple = Evaluate({&args[0]});
+  std::vector<Literal> actual_literals = actual_tuple.DecomposeTuple();
+  EXPECT_TRUE(
+      absl::c_equal(args[0].data<float>(), actual_literals[0].data<float>()));
+}
 
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
index f95a3c4ef9a88198722d54c8a9a4ef4017d23a2c..698b177310476f4a5bca11b423525eb20e7fcf98 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
@@ -917,7 +917,11 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   Status HandleClamp(HloInstruction* clamp) {
     std::function<ElementwiseT(ElementwiseT, ElementwiseT, ElementwiseT)>
         clamp_op = [](ElementwiseT low, ElementwiseT value, ElementwiseT high) {
-          return std::fmin(high, std::fmax(value, low));
+          if (std::isnan(low) || std::isnan(high)) {
+            return static_cast<ElementwiseT>(NAN);
+          }
+          return static_cast<ElementwiseT>(
+              std::fmin(high, std::fmax(value, low)));
         };
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[clamp],
@@ -1406,10 +1410,12 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     auto operand = dynamic_slice->operand(0);
     auto start_indices = dynamic_slice->operand(1);
     auto result_shape = dynamic_slice->shape();
-    TF_ASSIGN_OR_RETURN(auto inferred_return_shape,
-                        ShapeInference::InferDynamicSliceShape(
-                            operand->shape(), start_indices->shape(),
-                            dynamic_slice->dynamic_slice_sizes()));
+    TF_ASSIGN_OR_RETURN(
+        auto inferred_return_shape,
+        ShapeInference::InferDynamicSliceShape(
+            operand->shape(),
+            Cast<HloDynamicSliceInstruction>(dynamic_slice)->index_shapes(),
+            dynamic_slice->dynamic_slice_sizes()));
     TF_RET_CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape))
         << "return shape is set to: " << ShapeUtil::HumanString(result_shape)
         << " but is inferred to be: "
@@ -1418,33 +1424,39 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
         primitive_util::IsIntegralType(start_indices->shape().element_type()));
 
     const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
-    const Literal& start_indices_literal =
-        parent_->GetEvaluatedLiteralFor(start_indices);
 
     switch (start_indices->shape().element_type()) {
       case S32: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_slice],
-            DynamicSlice<int32>(operand_literal, start_indices_literal,
-                                result_shape));
+            DynamicSlice<int32>(
+                operand_literal,
+                absl::MakeConstSpan(dynamic_slice->operands()).subspan(1),
+                result_shape));
       } break;
       case S64: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_slice],
-            DynamicSlice<int64>(operand_literal, start_indices_literal,
-                                result_shape));
+            DynamicSlice<int64>(
+                operand_literal,
+                absl::MakeConstSpan(dynamic_slice->operands()).subspan(1),
+                result_shape));
       } break;
       case U32: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_slice],
-            DynamicSlice<uint32>(operand_literal, start_indices_literal,
-                                 result_shape));
+            DynamicSlice<uint32>(
+                operand_literal,
+                absl::MakeConstSpan(dynamic_slice->operands()).subspan(1),
+                result_shape));
       } break;
       case U64: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_slice],
-            DynamicSlice<uint64>(operand_literal, start_indices_literal,
-                                 result_shape));
+            DynamicSlice<uint64>(
+                operand_literal,
+                absl::MakeConstSpan(dynamic_slice->operands()).subspan(1),
+                result_shape));
       } break;
       default:
         LOG(FATAL) << "HandleDynamicSlice: unhandled primitive type for "
@@ -1464,7 +1476,9 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     TF_ASSIGN_OR_RETURN(
         auto inferred_return_shape,
         ShapeInference::InferDynamicUpdateSliceShape(
-            operand->shape(), update->shape(), start_indices->shape()));
+            operand->shape(), update->shape(),
+            Cast<HloDynamicUpdateSliceInstruction>(dynamic_update_slice)
+                ->index_shapes()));
     TF_RET_CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape))
         << "return shape is set to: " << ShapeUtil::HumanString(result_shape)
         << " but is inferred to be: "
@@ -1475,33 +1489,39 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
     const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
     const Literal& update_literal = parent_->GetEvaluatedLiteralFor(update);
-    const Literal& start_indices_literal =
-        parent_->GetEvaluatedLiteralFor(start_indices);
 
     switch (start_indices->shape().element_type()) {
       case S32: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_update_slice],
-            DynamicUpdateSlice<int32>(operand_literal, update_literal,
-                                      start_indices_literal));
+            DynamicUpdateSlice<int32>(
+                operand_literal, update_literal,
+                absl::MakeConstSpan(dynamic_update_slice->operands())
+                    .subspan(2)));
       } break;
       case S64: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_update_slice],
-            DynamicUpdateSlice<int64>(operand_literal, update_literal,
-                                      start_indices_literal));
+            DynamicUpdateSlice<int64>(
+                operand_literal, update_literal,
+                absl::MakeConstSpan(dynamic_update_slice->operands())
+                    .subspan(2)));
       } break;
       case U32: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_update_slice],
-            DynamicUpdateSlice<uint32>(operand_literal, update_literal,
-                                       start_indices_literal));
+            DynamicUpdateSlice<uint32>(
+                operand_literal, update_literal,
+                absl::MakeConstSpan(dynamic_update_slice->operands())
+                    .subspan(2)));
       } break;
       case U64: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_update_slice],
-            DynamicUpdateSlice<uint64>(operand_literal, update_literal,
-                                       start_indices_literal));
+            DynamicUpdateSlice<uint64>(
+                operand_literal, update_literal,
+                absl::MakeConstSpan(dynamic_update_slice->operands())
+                    .subspan(2)));
       } break;
       default:
         LOG(FATAL) << "HandleDynamicUpdateSlice: unhandled primitive type for "
@@ -1538,7 +1558,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
           }
 
           Literal computed_result =
-              embedded_evaluator.Evaluate<Literal>(*computation, arg_literals)
+              embedded_evaluator.Evaluate(*computation, arg_literals)
                   .ConsumeValueOrDie();
           // Clear visit states so that the we can use the evaluate again on
           // the same computation.
@@ -1635,8 +1655,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
           // Extract a slice from the literal that corresponds to exactly the
           // row in dimension 'sort_dim'.
           std::vector<int64> limit_indices(indices.begin(), indices.end());
-          std::for_each(limit_indices.begin(), limit_indices.end(),
-                        [](int64& index) { ++index; });
+          absl::c_for_each(limit_indices, [](int64& index) { ++index; });
           limit_indices[sort_dim] = sort_dim_elements;
           TF_ASSIGN_OR_RETURN(auto row_to_sort,
                               keys_literal.Slice(indices, limit_indices)
@@ -1799,7 +1818,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
                              [](Literal& literal) { return &literal; });
 
               TF_ASSIGN_OR_RETURN(Literal computed_result,
-                                  embedded_evaluator.Evaluate<const Literal*>(
+                                  embedded_evaluator.Evaluate(
                                       *function, embedded_operands_ptrs));
               // Clear visit states so that we can use the evaluator again on
               // the same computation.
@@ -1915,8 +1934,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
             selected_val_literal.Set({}, *selected_val);
             Literal computed_result =
                 embedded_evaluator
-                    .Evaluate<const Literal*>(
-                        *select, {&selected_val_literal, &curr_val_literal})
+                    .Evaluate(*select,
+                              {&selected_val_literal, &curr_val_literal})
                     .ConsumeValueOrDie();
             bool selected = !computed_result.Get<bool>({});
             if (selected) {
@@ -1937,9 +1956,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
               scattered_literal.Set({}, scattered);
               Literal computed_result =
                   embedded_evaluator
-                      .Evaluate<const Literal*>(
-                          *scatter,
-                          {&source_literal_scatter, &scattered_literal})
+                      .Evaluate(*scatter,
+                                {&source_literal_scatter, &scattered_literal})
                       .ConsumeValueOrDie();
               result.Set(operand_index, computed_result.Get<ReturnT>({}));
               // Clear visit states so that the we can use the evaluator again
@@ -2013,8 +2031,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
                     LiteralUtil::CreateR0<ReturnT>(result_val);
                 Literal computed_result =
                     embedded_evaluator
-                        .Evaluate<const Literal*>(
-                            *function, {&result_val_literal, &curr_val_literal})
+                        .Evaluate(*function,
+                                  {&result_val_literal, &curr_val_literal})
                         .ConsumeValueOrDie();
 
                 // Clear visit states so that the we can use the evaluate again
@@ -2376,9 +2394,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
           LiteralUtil::CreateR0<ReturnT>(updates.Get<ReturnT>(update_index));
       Literal updated_result =
           embedded_evaluator
-              .Evaluate<const Literal*>(
-                  *scatter->to_apply(),
-                  {&result_value_literal, &update_value_literal})
+              .Evaluate(*scatter->to_apply(),
+                        {&result_value_literal, &update_value_literal})
               .ConsumeValueOrDie();
       // Clear visit states so that the we can use the evaluate again on the
       // same computation.
@@ -2617,7 +2634,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   template <typename NativeT, typename std::enable_if<std::is_same<
                                   double, NativeT>::value>::type* = nullptr>
   Status HandleReducePrecision(HloInstruction* reduce_precision) {
-    return InvalidArgument("Double not supported for reduce precision");
+    return InvalidArgument("Double is not supported for reduce precision");
   }
 
   template <
@@ -2632,12 +2649,13 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return HandleReducePrecision<ElementwiseT>(reduce_precision);
   }
 
-  template <typename NativeT,
-            typename std::enable_if<
-                std::is_same<NativeT, bfloat16>::value ||
-                std::is_same<NativeT, Eigen::half>::value ||
-                std::is_integral<NativeT>::value ||
-                std::is_floating_point<NativeT>::value>::type* = nullptr>
+  template <
+      typename NativeT,
+      typename std::enable_if<
+          std::is_same<NativeT, bfloat16>::value ||
+          std::is_same<NativeT, Eigen::half>::value ||
+          std::is_integral<NativeT>::value || is_complex_t<NativeT>::value ||
+          std::is_floating_point<NativeT>::value>::type* = nullptr>
   Status HandleIota(HloInstruction* instruction) {
     auto* iota = Cast<HloIotaInstruction>(instruction);
     const int64 iota_size = iota->shape().dimensions(iota->iota_dimension());
@@ -2668,12 +2686,13 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
     return Status::OK();
   }
-  template <typename NativeT,
-            typename std::enable_if<
-                !(std::is_same<NativeT, bfloat16>::value ||
-                  std::is_same<NativeT, Eigen::half>::value ||
-                  std::is_integral<NativeT>::value ||
-                  std::is_floating_point<NativeT>::value)>::type* = nullptr>
+  template <
+      typename NativeT,
+      typename std::enable_if<
+          !(std::is_same<NativeT, bfloat16>::value ||
+            std::is_same<NativeT, Eigen::half>::value ||
+            std::is_integral<NativeT>::value || is_complex_t<NativeT>::value ||
+            std::is_floating_point<NativeT>::value)>::type* = nullptr>
   Status HandleIota(HloInstruction* iota) {
     return UnsupportedTypeError(iota);
   }
@@ -2681,6 +2700,103 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return HandleIota<ReturnT>(iota);
   }
 
+  template <typename NativeT,
+            typename std::enable_if<
+                !(std::is_integral<NativeT>::value ||
+                  std::is_floating_point<NativeT>::value)>::type* = nullptr>
+  Status HandleRng(HloInstruction* random) {
+    return UnsupportedTypeError(random);
+  }
+  template <typename NativeT,
+            typename std::enable_if<
+                (std::is_floating_point<NativeT>::value)>::type* = nullptr>
+  Status HandleRng(HloInstruction* random) {
+    RandomDistribution distribution = random->random_distribution();
+    const auto result_shape = random->shape();
+    Literal result(result_shape);
+
+    switch (distribution) {
+      case RNG_UNIFORM: {
+        const Literal& low =
+            parent_->GetEvaluatedLiteralFor(random->operand(0));
+        const Literal& high =
+            parent_->GetEvaluatedLiteralFor(random->operand(1));
+
+        std::uniform_real_distribution<NativeT> generator(
+            low.Get<NativeT>({}), high.Get<NativeT>({}));
+
+        TF_RETURN_IF_ERROR(
+            result.Populate<NativeT>([&](absl::Span<const int64> /*indexes*/) {
+              return generator(parent_->engine_);
+            }));
+        break;
+      }
+      case RNG_NORMAL: {
+        const Literal& mean =
+            parent_->GetEvaluatedLiteralFor(random->operand(0));
+        const Literal& stddev =
+            parent_->GetEvaluatedLiteralFor(random->operand(1));
+
+        std::normal_distribution<NativeT> generator(mean.Get<NativeT>({}),
+                                                    stddev.Get<NativeT>({}));
+
+        TF_RETURN_IF_ERROR(
+            result.Populate<NativeT>([&](absl::Span<const int64> /*indexes*/) {
+              return generator(parent_->engine_);
+            }));
+        break;
+      }
+      default:
+        return UnimplementedStrCat("The distribution ",
+                                   RandomDistribution_Name(distribution),
+                                   " is not implemented.");
+    }
+    parent_->evaluated_[random] = std::move(result);
+    return Status::OK();
+  }
+  template <typename NativeT,
+            typename std::enable_if<(std::is_integral<NativeT>::value)>::type* =
+                nullptr>
+  Status HandleRng(HloInstruction* random) {
+    RandomDistribution distribution = random->random_distribution();
+    const auto result_shape = random->shape();
+    Literal result(result_shape);
+
+    switch (distribution) {
+      case RNG_UNIFORM: {
+        const Literal& low =
+            parent_->GetEvaluatedLiteralFor(random->operand(0));
+        const Literal& high =
+            parent_->GetEvaluatedLiteralFor(random->operand(1));
+
+        // Note std::uniform_int_distribution assumes interval is closed, i.e.,
+        // [low, high], but we want [low, high) instead. Hence high-1 is used as
+        // the upper range.
+        std::uniform_int_distribution<int64> generator(
+            low.Get<NativeT>({}), high.Get<NativeT>({}) - 1);
+
+        TF_RETURN_IF_ERROR(
+            result.Populate<NativeT>([&](absl::Span<const int64> /*indexes*/) {
+              return static_cast<NativeT>(generator(parent_->engine_));
+            }));
+        break;
+      }
+      case RNG_NORMAL: {
+        return Unimplemented(
+            "Normal distribution is not supported for integral types.");
+      }
+      default:
+        return UnimplementedStrCat("The distribution ",
+                                   RandomDistribution_Name(distribution),
+                                   " is not implemented.");
+    }
+    parent_->evaluated_[random] = std::move(result);
+    return Status::OK();
+  }
+  Status HandleRng(HloInstruction* random) override {
+    return HandleRng<ReturnT>(random);
+  }
+
  private:
   // Creates a vector of multipliers which can be used to create a linear index
   // into shape.
@@ -2740,12 +2856,27 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   template <typename IndexT>
-  StatusOr<Literal> DynamicSlice(const Literal& operand_literal,
-                                 const Literal& start_indices_literal,
-                                 const Shape& result_shape) {
-    auto start_indices_typed = start_indices_literal.data<IndexT>();
-    std::vector<int64> start(start_indices_typed.begin(),
-                             start_indices_typed.end());
+  StatusOr<Literal> DynamicSlice(
+      const Literal& operand_literal,
+      absl::Span<HloInstruction* const> start_indices,
+      const Shape& result_shape) {
+    std::vector<int64> start;
+    // TODO(b/118437727): Remove the R1 code-path. Note that to distinguish
+    // between the cases, this currently assumes there is at least 1 index. That
+    // is wrong in the general case, because for scalar indices, if the operand
+    // is scalar, then there are no indices. This problem with resolve itself.
+    const HloInstruction* first_index = start_indices[0];
+    if (first_index->shape().rank() == 1) {
+      auto start_indices_typed =
+          parent_->GetEvaluatedLiteralFor(first_index).data<IndexT>();
+      start = std::vector<int64>(start_indices_typed.begin(),
+                                 start_indices_typed.end());
+    } else {
+      for (HloInstruction* index : start_indices) {
+        start.push_back(
+            parent_->GetEvaluatedLiteralFor(index).GetFirstElement<IndexT>());
+      }
+    }
 
     // Clamp the start indices so the slice is in-bounds w.r.t the operand.
     for (int64 i = 0; i < start.size(); ++i) {
@@ -2771,14 +2902,28 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   template <typename IndexT>
-  StatusOr<Literal> DynamicUpdateSlice(const Literal& operand_literal,
-                                       const Literal& update_literal,
-                                       const Literal& start_indices_literal) {
+  StatusOr<Literal> DynamicUpdateSlice(
+      const Literal& operand_literal, const Literal& update_literal,
+      absl::Span<HloInstruction* const> start_indices) {
     auto result = operand_literal.Clone();
-    auto start_indices_typed = start_indices_literal.data<IndexT>();
     const auto rank = result.shape().rank();
-    std::vector<int64> start(start_indices_typed.begin(),
-                             start_indices_typed.end());
+    std::vector<int64> start;
+    // TODO(b/118437727): Remove the R1 code-path. Note that to distinguish
+    // between the cases, this currently assumes there is at least 1 index. That
+    // is wrong in the general case, because for scalar indices, if the operand
+    // is scalar, then there are no indices. This problem with resolve itself.
+    const HloInstruction* first_index = start_indices[0];
+    if (first_index->shape().rank() == 1) {
+      auto start_indices_typed =
+          parent_->GetEvaluatedLiteralFor(first_index).data<IndexT>();
+      start = std::vector<int64>(start_indices_typed.begin(),
+                                 start_indices_typed.end());
+    } else {
+      for (HloInstruction* index : start_indices) {
+        start.push_back(
+            parent_->GetEvaluatedLiteralFor(index).GetFirstElement<IndexT>());
+      }
+    }
     // Clamp the update start indices so the slice is in-bounds w.r.t the
     // operand.
     for (int64 i = 0; i < rank; ++i) {
diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc
index c919dbd82d3668c477bf37074f1d56f8cb7d9506..862b2029718bbd802b69d789b66683a4edfa2367 100644
--- a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc
+++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "absl/algorithm/container.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
@@ -25,7 +26,9 @@ namespace xla {
 
 namespace {
 
-StatusOr<bool> ReplaceGetSize(HloInstruction* instr) {
+StatusOr<bool> ReplaceGetSize(
+    HloInstruction* instr,
+    const DynamicDimensionInference* dynamic_dimension_inference) {
   if (instr->opcode() != HloOpcode::kGetDimensionSize) {
     return false;
   }
@@ -36,10 +39,18 @@ StatusOr<bool> ReplaceGetSize(HloInstruction* instr) {
                           instr->operand(0)->shape(), instr->dimension()));
   TF_RET_CHECK(ShapeUtil::Equal(instr->shape(), legal_shape));
   TF_RET_CHECK(ShapeUtil::HasPrimitiveType(instr->shape(), U32));
-  uint32 size = instr->operand(0)->shape().dimensions(instr->dimension());
-  HloInstruction* new_instr = computation->AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint32>(size)));
-  TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(new_instr));
+  HloInstruction* operand = instr->mutable_operand(0);
+  int64 dim = instr->dimension();
+  HloInstruction* dynamic_size =
+      dynamic_dimension_inference->GetDynamicSize(operand, {}, dim);
+  if (dynamic_size != nullptr) {
+    TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(dynamic_size));
+  } else {
+    uint32 size = instr->operand(0)->shape().dimensions(dim);
+    HloInstruction* new_instr = computation->AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint32>(size)));
+    TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(new_instr));
+  }
   return true;
 }
 
@@ -48,10 +59,13 @@ StatusOr<bool> ReplaceGetSize(HloInstruction* instr) {
 StatusOr<bool> HloGetDimensionSizeRewriter::Run(HloModule* module) {
   bool changed = false;
   HloProto proto;
+  TF_ASSIGN_OR_RETURN(DynamicDimensionInference inference,
+                      DynamicDimensionInference::Run(module));
   *proto.mutable_hlo_module() = module->ToProto();
   for (auto* computation : module->computations()) {
     for (auto instruction : computation->instructions()) {
-      TF_ASSIGN_OR_RETURN(bool replaced, ReplaceGetSize(instruction));
+      TF_ASSIGN_OR_RETURN(bool replaced,
+                          ReplaceGetSize(instruction, &inference));
       changed = changed || replaced;
     }
   }
diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h
index 30f44c23a835b3bcc935caaa917e040e07c4e703..9aa79fe66b665c48ec871c4188e44ba2056de3ad 100644
--- a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h
+++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h
@@ -21,7 +21,9 @@ limitations under the License.
 
 namespace xla {
 
-// Pass to replace a kGetDimensionSize instruction with a constant instruction.
+// Pass to replace a kGetDimensionSize instruction with a hlo instruction
+// representing the dynamic size if the dimension is dynamic, otherwise a
+// constant instruction representing the static size.
 class HloGetDimensionSizeRewriter : public HloModulePass {
  public:
   absl::string_view name() const override {
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index c6eaead8dd9c3dbf19bce34989e8b1f0f60468cc..4c7f5e9e7dfb12a8cb699bdf397eab21983342a1 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -24,9 +24,9 @@ limitations under the License.
 #include <queue>
 #include <string>
 #include <tuple>
-#include <unordered_map>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
@@ -380,7 +380,7 @@ class HloDotDumper {
   // Each HloInstruction dumped gets a monotically-increasing node ID.  This
   // must start at 1, because that's where graphviz's accounting starts.
   int64 next_node_id_ = 1;
-  std::unordered_map<const HloInstruction*, int64> node_ids_;
+  absl::flat_hash_map<const HloInstruction*, int64> node_ids_;
 
   // The "root" tag doesn't have an associated HloInstruction pointer, so we
   // need to store it outside the map.
@@ -397,7 +397,7 @@ class HloDotDumper {
 
   // Each HloComputation that's emitted gets a monotonically-increasing ID.
   int64 next_cluster_id_ = 1;
-  std::unordered_map<const HloComputation*, int64> cluster_ids_;
+  absl::flat_hash_map<const HloComputation*, int64> cluster_ids_;
 
   // Edges to print from Footer().  Edges come at the end because graphviz is
   // unhappy if an edge from a subcomputation to a node in the outer computation
@@ -407,7 +407,7 @@ class HloDotDumper {
 
   // When coloring by sharding information, we track the sharding string
   // representation to color association, by round-robin the color schemes.
-  std::unordered_map<HloSharding, ColorScheme, HloSharding::Hasher>
+  absl::flat_hash_map<HloSharding, ColorScheme, HloSharding::Hasher>
       sharding_colors_;
   int64 next_shard_color_ = 0;
 };
@@ -561,8 +561,8 @@ bool HloDotDumper::ShouldShowSubcomputation(const HloComputation* subcomp) {
   }
 
   // Show the subcomputation if we're showing any of its members.
-  return std::any_of(
-      subcomp->instructions().begin(), subcomp->instructions().end(),
+  return absl::c_any_of(
+      subcomp->instructions(),
       [&](const HloInstruction* instr) { return filter_.Show(instr); });
 }
 
@@ -735,15 +735,14 @@ bool HloDotDumper::ShouldMergeIntoUsers(const HloInstruction* instr) const {
   const int kMinUsersToOmit = 3;
   return instr->opcode() == HloOpcode::kParameter && instr->shape().IsTuple() &&
          !instr->IsFused() &&
-         std::count_if(instr->users().begin(), instr->users().end(),
-                       [&](const HloInstruction* user) {
-                         return filter_.Show(user);
-                       }) > kMinUsersToOmit &&
-         std::all_of(instr->users().begin(), instr->users().end(),
-                     [&](const HloInstruction* user) {
-                       return !filter_.Show(user) ||
-                              user->opcode() == HloOpcode::kGetTupleElement;
-                     });
+         absl::c_count_if(instr->users(),
+                          [&](const HloInstruction* user) {
+                            return filter_.Show(user);
+                          }) > kMinUsersToOmit &&
+         absl::c_all_of(instr->users(), [&](const HloInstruction* user) {
+           return !filter_.Show(user) ||
+                  user->opcode() == HloOpcode::kGetTupleElement;
+         });
 }
 
 string HloDotDumper::DumpInstruction(const HloInstruction* instr) {
@@ -900,12 +899,11 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
   // the same color as a parameter.  Unless the merged-in parameter is a
   // parameter to a fusion node that is bound to a constant -- these aren't
   // "real" parameters from the user's perspective.
-  if (std::any_of(instr->operands().begin(), instr->operands().end(),
-                  [&](const HloInstruction* operand) {
-                    return operand->opcode() == HloOpcode::kParameter &&
-                           ShouldMergeIntoUsers(operand) &&
-                           TryGetFusionParameterConstant(operand) == nullptr;
-                  })) {
+  if (absl::c_any_of(instr->operands(), [&](const HloInstruction* operand) {
+        return operand->opcode() == HloOpcode::kParameter &&
+               ShouldMergeIntoUsers(operand) &&
+               TryGetFusionParameterConstant(operand) == nullptr;
+      })) {
     return parameter_color;
   }
 
@@ -1286,7 +1284,7 @@ NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
                                       int64 radius) {
   // First, find the neighborhood of nodes with distance from root <= radius.
   // These nodes are our initial set of "normal" nodes.
-  std::unordered_map<const HloInstruction*, NodeFilterResult> nodes;
+  absl::flat_hash_map<const HloInstruction*, NodeFilterResult> nodes;
   std::deque<std::pair<const HloInstruction*, /*depth*/ int64>> worklist;
   worklist.push_back({root, 0});
   while (!worklist.empty()) {
@@ -1307,7 +1305,7 @@ NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
     // are not interesting to the graph at hand.
     if (instr == root || instr->opcode() != HloOpcode::kTuple) {
       for (const HloInstruction* operand : instr->operands()) {
-        if (!nodes.count(operand)) {
+        if (!nodes.contains(operand)) {
           worklist.push_back({operand, depth + 1});
         }
       }
@@ -1335,7 +1333,7 @@ NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
       continue;
     }
     for (const HloInstruction* user : instr->users()) {
-      if (!nodes.count(user)) {
+      if (!nodes.contains(user)) {
         worklist.push_back({user, depth + 1});
       }
     }
@@ -1344,7 +1342,7 @@ NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
   auto is_displayed = [&](const HloInstruction* instr) {
     // Constants are displayed inline with their users; they're never omitted.
     // Nodes in subcomputations are always shown.
-    return nodes.count(instr) > 0 || instr->opcode() == HloOpcode::kConstant ||
+    return nodes.contains(instr) || instr->opcode() == HloOpcode::kConstant ||
            instr->parent() != root->parent();
   };
 
@@ -1355,12 +1353,11 @@ NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
     NodeFilterResult& filter_result = kv.second;
     const auto& operands = instr->operands();
 
-    if (std::any_of(operands.begin(), operands.end(), is_displayed) &&
-        !std::all_of(operands.begin(), operands.end(), is_displayed)) {
+    if (absl::c_any_of(operands, is_displayed) &&
+        !absl::c_all_of(operands, is_displayed)) {
       // Mark nodes with some operands omitted appropriately.
       filter_result = kSomeOperandsOmitted;
-    } else if (!operands.empty() &&
-               std::none_of(operands.begin(), operands.end(), is_displayed)) {
+    } else if (!operands.empty() && absl::c_none_of(operands, is_displayed)) {
       // Mark nodes with *all* operands omitted appropriately.
       filter_result = kOmitNodeOperands;
     }
@@ -1368,8 +1365,7 @@ NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
     // Promote nodes with type kSomeUsersOmitted to kNormalNode if all of their
     // users made it into the graph.
     if (filter_result == kSomeUsersOmitted &&
-        std::all_of(instr->users().begin(), instr->users().end(),
-                    is_displayed)) {
+        absl::c_all_of(instr->users(), is_displayed)) {
       filter_result = kNormalNode;
     }
   }
diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc
index 6e1597fd03db0a78aa560340b7b9b64fe500df0c..b01c00121b3363630b83a1e49d0027a66f3a9e1a 100644
--- a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc
+++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc
@@ -17,22 +17,34 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 
 namespace xla {
+
+bool HloInputOutputAliasConfig::OutputHasAlias(
+    const ShapeIndex& output_index) const {
+  return alias_.element(output_index).has_value();
+}
+
 Status HloInputOutputAliasConfig::SetUpAlias(const ShapeIndex& output_index,
                                              int64 param_number,
-                                             const ShapeIndex& param_index) {
+                                             const ShapeIndex& param_index,
+                                             AliasKind kind) {
+  TF_RET_CHECK(kind == AliasKind::kUserAlias || kind == AliasKind::kSystemAlias)
+      << kind;
   TF_RET_CHECK(ShapeUtil::IndexIsValid(alias_.shape(), output_index))
       << absl::StrCat("Tring to set up alias at ", output_index.ToString(),
                       " which is an invalid index for shape ",
                       ShapeUtil::HumanString(alias_.shape()));
+  TF_RET_CHECK(param_number >= 0) << param_number;
+  TF_RET_CHECK(!OutputHasAlias(output_index))
+      << "Output index " << output_index << " already has an alias setup";
   // Output can't be aliased with multiple parameters.
   TF_RET_CHECK(!alias_.element(output_index)) << absl::StrFormat(
       "Trying to set up output alias for param %lld at %s but failed: output "
       "index %s is already aliased with param %lld at %s",
       param_number, param_index.ToString(), output_index.ToString(),
-      alias_.element(output_index)->first,
-      alias_.element(output_index)->second.ToString());
+      alias_.element(output_index)->parameter_number,
+      alias_.element(output_index)->parameter_index.ToString());
   (*alias_.mutable_element(output_index)) =
-      std::make_pair(param_number, param_index);
+      Alias(kind, param_number, param_index);
   VLOG(4) << "Set up alias between output index " << output_index.ToString()
           << " and parameter " << param_index << " at index "
           << param_index.ToString();
@@ -42,15 +54,24 @@ Status HloInputOutputAliasConfig::SetUpAlias(const ShapeIndex& output_index,
 HloInputOutputAliasProto HloInputOutputAliasConfig::ToProto() const {
   HloInputOutputAliasProto result;
   alias_.ForEachElement(
-      [&](const ShapeIndex& index,
-          const absl::optional<std::pair<int64, ShapeIndex>>& data) {
+      [&](const ShapeIndex& index, const absl::optional<Alias>& data) {
         if (data) {
           HloInputOutputAliasProto::AliasEntryProto entry;
+          switch (data->kind) {
+            case AliasKind::kUserAlias:
+              entry.set_kind(HloInputOutputAliasProto::USER_ALIAS);
+              break;
+            case AliasKind::kSystemAlias:
+              entry.set_kind(HloInputOutputAliasProto::SYSTEM_ALIAS);
+              break;
+            default:
+              LOG(FATAL) << "Unknown alias kind " << data->kind;
+          }
           for (int64 i : index) {
             entry.add_output_shape_index(i);
           }
-          entry.set_parameter_number(data->first);
-          for (int64 i : data->second) {
+          entry.set_parameter_number(data->parameter_number);
+          for (int64 i : data->parameter_index) {
             entry.add_parameter_shape_index(i);
           }
           result.add_entries()->Swap(&entry);
@@ -66,14 +87,18 @@ StatusOr<HloInputOutputAliasConfig> HloInputOutputAliasConfig::CreateFromProto(
        proto.entries()) {
     ShapeIndex output_index(entry.output_shape_index().begin(),
                             entry.output_shape_index().end());
-
     int64 param_number = entry.parameter_number();
     ShapeIndex param_index(entry.parameter_shape_index().begin(),
                            entry.parameter_shape_index().end());
+    // Handle backward compatibility with existing protos, which only knew of
+    // system aliases.
+    AliasKind kind = AliasKind::kSystemAlias;
+    if (entry.kind() == HloInputOutputAliasProto::USER_ALIAS) {
+      kind = AliasKind::kUserAlias;
+    }
     TF_RETURN_IF_ERROR(
-        result.SetUpAlias(output_index, param_number, param_index));
+        result.SetUpAlias(output_index, param_number, param_index, kind));
   }
-
   return result;
 }
 
@@ -81,45 +106,44 @@ string HloInputOutputAliasConfig::ToString() const {
   std::vector<string> pieces;
   pieces.push_back("HloInputOutputAliasConfig");
 
-  ForEachAlias([&](const ShapeIndex& output_index, int64 param_number,
-                   const ShapeIndex& param_index) {
+  ForEachAlias([&](const ShapeIndex& output_index, const Alias& alias) {
+    const char* kind = alias.kind == AliasKind::kUserAlias ? "USER" : "SYSTEM";
     pieces.push_back(absl::StrFormat(
-        "  OutputIndex %s is aliased with parameter %lld at %s:",
-        output_index.ToString(), param_number, param_index.ToString()));
+        "  OutputIndex %s is aliased (kind=%s) with parameter %lld at %s:",
+        output_index.ToString(), kind, alias.parameter_number,
+        alias.parameter_index.ToString()));
   });
-
   return absl::StrJoin(pieces, "\n");
 }
 
-bool HloInputOutputAliasConfig::ParameterHasAlias(
+HloInputOutputAliasConfig::AliasKind
+HloInputOutputAliasConfig::ParameterAliasKind(
     int64 param_number, const ShapeIndex& param_index) const {
-  bool output = false;
+  AliasKind kind = AliasKind::kNoAlias;
   alias_.ForEachElement(
-      [&](const xla::ShapeIndex&,
-          absl::optional<std::pair<int64, ShapeIndex>> alias) {
-        if (alias && alias->first == param_number &&
-            alias->second == param_index) {
-          output = true;
+      [&](const xla::ShapeIndex&, absl::optional<Alias> alias) {
+        if (alias && alias->parameter_number == param_number &&
+            alias->parameter_index == param_index) {
+          kind = alias->kind;
         }
       });
-  return output;
+  return kind;
 }
 
 absl::optional<ShapeIndex> HloInputOutputAliasConfig::GetAliasedOutput(
     int64 param_number, const ShapeIndex& param_index) const {
   absl::optional<ShapeIndex> output;
   alias_.ForEachElement(
-      [&](const xla::ShapeIndex& output_index,
-          absl::optional<std::pair<int64, ShapeIndex>> alias) {
-        if (alias && alias->first == param_number &&
-            alias->second == param_index) {
+      [&](const xla::ShapeIndex& output_index, absl::optional<Alias> alias) {
+        if (alias && alias->parameter_number == param_number &&
+            alias->parameter_index == param_index) {
           output = output_index;
         }
       });
   return output;
 }
 
-absl::optional<std::pair<int64, ShapeIndex>>
+absl::optional<HloInputOutputAliasConfig::Alias>
 HloInputOutputAliasConfig::GetAliasedParameter(
     const ShapeIndex& output_index) const {
   CHECK(ShapeUtil::IndexIsValid(alias_.shape(), output_index));
@@ -128,10 +152,9 @@ HloInputOutputAliasConfig::GetAliasedParameter(
 
 void HloInputOutputAliasConfig::ForEachAlias(AliasFn fn) const {
   alias_.ForEachElement(
-      [&](const ShapeIndex& output_index,
-          absl::optional<std::pair<int64, ShapeIndex>> aliased) {
+      [&](const ShapeIndex& output_index, absl::optional<Alias> aliased) {
         if (aliased) {
-          fn(output_index, aliased->first, aliased->second);
+          fn(output_index, *aliased);
         }
       });
 }
@@ -139,10 +162,9 @@ void HloInputOutputAliasConfig::ForEachAlias(AliasFn fn) const {
 Status HloInputOutputAliasConfig::ForEachAliasWithStatus(
     AliasFnWithStatus fn) const {
   return alias_.ForEachElementWithStatus(
-      [&](const ShapeIndex& output_index,
-          absl::optional<std::pair<int64, ShapeIndex>> aliased) {
+      [&](const ShapeIndex& output_index, absl::optional<Alias> aliased) {
         if (aliased) {
-          TF_RETURN_IF_ERROR(fn(output_index, aliased->first, aliased->second));
+          TF_RETURN_IF_ERROR(fn(output_index, *aliased));
         }
         return Status::OK();
       });
@@ -158,20 +180,19 @@ Status HloInputOutputAliasConfig::Verify(
     param_has_seen.emplace_back(param->shape());
   }
   return ForEachAliasWithStatus([&](const ShapeIndex& output_index,
-                                    int64 param_number,
-                                    const ShapeIndex& param_index) -> Status {
+                                    const Alias& alias) -> Status {
     const HloInstruction* root = entry->root_instruction();
 
-    TF_RET_CHECK(0 <= param_number);
-    TF_RET_CHECK(entry->num_parameters() > param_number);
+    TF_RET_CHECK(0 <= alias.parameter_number);
+    TF_RET_CHECK(entry->num_parameters() > alias.parameter_number);
     const Shape& param_shape =
-        entry->parameter_instruction(param_number)->shape();
+        entry->parameter_instruction(alias.parameter_number)->shape();
     const Shape& output_shape = root->shape();
-    TF_RET_CHECK(ShapeUtil::IndexIsValid(param_shape, param_index));
+    TF_RET_CHECK(ShapeUtil::IndexIsValid(param_shape, alias.parameter_index));
     TF_RET_CHECK(ShapeUtil::IndexIsValid(output_shape, output_index));
 
     const Shape& param_subshape =
-        ShapeUtil::GetSubshape(param_shape, param_index);
+        ShapeUtil::GetSubshape(param_shape, alias.parameter_index);
     const Shape& output_subshape =
         ShapeUtil::GetSubshape(output_shape, output_index);
     TF_RET_CHECK(LayoutUtil::IsDenseArray(param_subshape));
@@ -182,19 +203,20 @@ Status HloInputOutputAliasConfig::Verify(
           "Expected aliased input %lld at index %s and output at index %s to "
           "have the same size. Input sub-shape is %s with size %lld, output "
           "sub-shape is %s with size %lld",
-          param_number, param_index.ToString(), output_index.ToString(),
+          alias.parameter_number, alias.parameter_index.ToString(),
+          output_index.ToString(),
           ShapeUtil::HumanStringWithLayout(param_subshape),
           size_func(param_subshape),
           ShapeUtil::HumanStringWithLayout(output_subshape),
           size_func(output_subshape));
     }
 
-    // Check each param_number and param_index pair only show up once. No
-    // input can be aliased with output buffers.
-    TF_RET_CHECK(param_has_seen[param_number].element(param_index) == false);
-
-    *(param_has_seen[param_number].mutable_element(param_index)) = true;
-
+    // Check each alias.parameter_number and alias.parameter_index pair only
+    // show up once. No input can be aliased with output buffers.
+    TF_RET_CHECK(param_has_seen[alias.parameter_number].element(
+                     alias.parameter_index) == false);
+    *(param_has_seen[alias.parameter_number].mutable_element(
+        alias.parameter_index)) = true;
     return Status::OK();
   });
 }
diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h
index 439676b1546c4af7f781fb80bccffd5248309b0f..b0b71dece81b561f492767db8c1ccbe3fde442d4 100644
--- a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h
+++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <utility>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/shape_tree.h"
@@ -31,6 +32,28 @@ class HloModule;
 // parameter index in the entry computation.
 class HloInputOutputAliasConfig {
  public:
+  // The kind of aliases which can be set. A kUserAlias is one setup at
+  // compilation time by the user, and has to be respected. A kSystemAlias one
+  // might be setup by the compiler, if it decides it is convenient to do so.
+  enum AliasKind {
+    kNoAlias,
+    kUserAlias,
+    kSystemAlias,
+  };
+
+  // Defines the alias information for a given output buffer. A given output
+  // buffer shape index can refer only to one parameter+index.
+  struct Alias {
+    Alias(AliasKind kind, int64 parameter_number, ShapeIndex parameter_index)
+        : kind(kind),
+          parameter_number(parameter_number),
+          parameter_index(std::move(parameter_index)) {}
+
+    AliasKind kind;
+    int64 parameter_number;
+    ShapeIndex parameter_index;
+  };
+
   HloInputOutputAliasConfig() = default;
 
   explicit HloInputOutputAliasConfig(Shape shape) : alias_(shape) {}
@@ -40,12 +63,22 @@ class HloInputOutputAliasConfig {
   // Sets up alias config from `output_index` to `param_index` at
   // `param_number`.
   Status SetUpAlias(const ShapeIndex& output_index, int64 param_number,
-                    const ShapeIndex& param_index);
+                    const ShapeIndex& param_index, AliasKind kind);
+
+  // Returns the kind of alias for the given parameter number and parameter
+  // index. If no alias exists, AliasKind::kNoAlias is returned.
+  AliasKind ParameterAliasKind(int64 param_number,
+                               const ShapeIndex& param_index) const;
 
   // Returns true if the given parameter is aliased with one of the output
   // buffers.
   bool ParameterHasAlias(int64 param_number,
-                         const ShapeIndex& param_index) const;
+                         const ShapeIndex& param_index) const {
+    return ParameterAliasKind(param_number, param_index) != AliasKind::kNoAlias;
+  }
+
+  // Checks whether the provided output index has already been aliased.
+  bool OutputHasAlias(const ShapeIndex& output_index) const;
 
   // (De)Serializes an HloInputOutoutAliasConfig to/from an
   // HloInputOutoutAliasProto.
@@ -63,19 +96,17 @@ class HloInputOutputAliasConfig {
   // Returns the number of parameter and index of the parameter buffer that the
   // given output buffer index is aliased with. A nullopt is returned if there
   // is no parameter is aliased with the specific output.
-  absl::optional<std::pair<int64, ShapeIndex>> GetAliasedParameter(
+  absl::optional<Alias> GetAliasedParameter(
       const ShapeIndex& output_index) const;
 
   using AliasFn =
-      std::function<void(const ShapeIndex& output_index, int64 param_number,
-                         const ShapeIndex& param_index)>;
+      std::function<void(const ShapeIndex& output_index, const Alias&)>;
 
   // Iterates through each aliased output and input.
   void ForEachAlias(AliasFn fn) const;
 
   using AliasFnWithStatus =
-      std::function<Status(const ShapeIndex& output_index, int64 param_number,
-                           const ShapeIndex& param_index)>;
+      std::function<Status(const ShapeIndex& output_index, const Alias&)>;
 
   // Verifies that the given config is valid for the given module.
   // Specifically, the config's input and output should be in-bound and size of
@@ -90,9 +121,10 @@ class HloInputOutputAliasConfig {
  private:
   // A ShapeTree which indicates the list of buffers that's expected to be
   // aliased. The key on this shape tree represents the output index. The value
-  // is a pair of parameter number and index into the buffer. If the value is
-  // nullopt, it means there is no parameter aliasing for this output.
-  ShapeTree<absl::optional<std::pair<int64, ShapeIndex>>> alias_;
+  // is an Alias data structure which defines the input parameter coordinates.
+  // If the value is nullopt, it means there is no parameter aliasing for this
+  // output.
+  ShapeTree<absl::optional<Alias>> alias_;
 };
 
 std::ostream& operator<<(std::ostream& out,
diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc b/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc
index aeb9b0fdc8b6cca87731a2d4aae25120af6c3215..a46a107723de30176241aae01b268a8c10d991d3 100644
--- a/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc
@@ -45,11 +45,12 @@ class HloInputOutputAliasConfigTest : public HloTestBase {
     EXPECT_TRUE(aliased_output);
     EXPECT_EQ(aliased_output.value(), output_index);
 
-    absl::optional<std::pair<int64, ShapeIndex>> aliased_param =
+    absl::optional<HloInputOutputAliasConfig::Alias> aliased_param =
         config.GetAliasedParameter(output_index);
 
     EXPECT_TRUE(aliased_param);
-    EXPECT_EQ(aliased_param.value(), std::make_pair(param_number, param_index));
+    EXPECT_EQ(aliased_param->parameter_number, param_number);
+    EXPECT_EQ(aliased_param->parameter_index, param_index);
   }
 
   void expect_not_aliased(const ShapeIndex& output_index, int64 param_number,
@@ -60,11 +61,12 @@ class HloInputOutputAliasConfigTest : public HloTestBase {
 
     EXPECT_FALSE(aliased_output && aliased_output == output_index);
 
-    absl::optional<std::pair<int64, ShapeIndex>> aliased_param =
+    absl::optional<HloInputOutputAliasConfig::Alias> aliased_param =
         config.GetAliasedParameter(output_index);
 
-    EXPECT_FALSE(aliased_param && aliased_param->first == param_number &&
-                 aliased_param->second == param_index);
+    EXPECT_FALSE(aliased_param &&
+                 aliased_param->parameter_number == param_number &&
+                 aliased_param->parameter_index == param_index);
   }
 };
 
@@ -84,8 +86,10 @@ ENTRY main {
   HloInputOutputAliasConfig config(
       module->entry_computation()->root_instruction()->shape());
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/1,
-                                 /*param_index=*/{}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/1,
+      /*param_index=*/{},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   expect_aliased(/*output_index=*/{0}, /*param_number=*/1,
                  /*param_index=*/{}, config);
@@ -114,11 +118,15 @@ ENTRY main {
   HloInputOutputAliasConfig config(
       module->entry_computation()->root_instruction()->shape());
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/0,
-                                 /*param_index=*/{0}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0,
+      /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{1}, /*param_number=*/0,
-                                 /*param_index=*/{1}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0,
+      /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   expect_aliased(/*output_index=*/{0}, /*param_number=*/0,
                  /*param_index=*/{0}, config);
@@ -149,11 +157,15 @@ ENTRY main {
   HloInputOutputAliasConfig config(
       module->entry_computation()->root_instruction()->shape());
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/0,
-                                 /*param_index=*/{}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0,
+      /*param_index=*/{},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{1}, /*param_number=*/0,
-                                 /*param_index=*/{}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0,
+      /*param_index=*/{},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   ASSERT_IS_NOT_OK(config.Verify(*module, [](const Shape& shape) {
     return ShapeUtil::ByteSizeOf(shape);
@@ -176,8 +188,10 @@ ENTRY main {
   HloInputOutputAliasConfig config(
       module->entry_computation()->root_instruction()->shape());
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{1}, /*param_number=*/0,
-                                 /*param_index=*/{}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0,
+      /*param_index=*/{},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   ASSERT_IS_NOT_OK(config.Verify(*module, [](const Shape& shape) {
     return ShapeUtil::ByteSizeOf(shape);
@@ -200,11 +214,15 @@ ENTRY main {
   HloInputOutputAliasConfig config(
       module->entry_computation()->root_instruction()->shape());
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/0,
-                                 /*param_index=*/{}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0,
+      /*param_index=*/{},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
-  ASSERT_IS_NOT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/1,
-                                     /*param_index=*/{}));
+  ASSERT_IS_NOT_OK(config.SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/1,
+      /*param_index=*/{},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 }
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 29155359872d061544159ea3374554a35939ff5e..029d170317cbd119a62989ad6ae115abde1977e9 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
+#include "absl/types/span.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
@@ -82,15 +83,14 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     return computation_map.at(proto.called_computation_ids(index));
   };
 
-  TF_RET_CHECK(std::all_of(
-      proto.operand_ids().begin(), proto.operand_ids().end(),
-      [&instruction_map](int64 id) { return instruction_map.contains(id); }))
+  TF_RET_CHECK(
+      absl::c_all_of(proto.operand_ids(),
+                     [&](int64 id) { return instruction_map.contains(id); }))
       << proto.name() << " instruction contains invalid operand id(s)";
 
-  TF_RET_CHECK(std::all_of(
-      proto.called_computation_ids().begin(),
-      proto.called_computation_ids().end(),
-      [&computation_map](int64 id) { return computation_map.contains(id); }))
+  TF_RET_CHECK(
+      absl::c_all_of(proto.called_computation_ids(),
+                     [&](int64 id) { return computation_map.contains(id); }))
       << proto.name() << " instruction references invalid computation id(s)";
 
   Shape shape(proto.shape());
@@ -452,13 +452,50 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           CreatePad(shape, operands(0), operands(1), proto.padding_config());
       break;
     case HloOpcode::kDynamicSlice: {
-      TF_RET_CHECK(proto.operand_ids_size() == 2)
-          << "DynamicSlice instruction should have 2 operands but sees "
-          << proto.operand_ids_size();
       std::vector<int64> slice_sizes(proto.dynamic_slice_sizes_size());
       absl::c_copy(proto.dynamic_slice_sizes(), slice_sizes.begin());
-      instruction =
-          CreateDynamicSlice(shape, operands(0), operands(1), slice_sizes);
+      TF_RET_CHECK(proto.operand_ids_size() >= 1)
+          << "DynamicSlice instruction should have at least 1 operands but "
+             "sees "
+          << proto.operand_ids_size();
+      if (proto.operand_ids_size() == 2 && operands(1)->shape().rank() == 1) {
+        // TODO(b/118437727): Old form, remove this path.
+        instruction =
+            CreateDynamicSlice(shape, operands(0), operands(1), slice_sizes);
+      } else {
+        // New form
+        auto expected_operands = 1 + operands(0)->shape().rank();
+        TF_RET_CHECK(proto.operand_ids_size() == expected_operands)
+            << "DynamicSlice instruction should have " << expected_operands
+            << " operands, but has " << proto.operand_ids_size();
+        const auto& operand_vector = all_operands();
+        instruction = CreateDynamicSlice(
+            shape, operands(0), absl::MakeSpan(operand_vector).subspan(1),
+            slice_sizes);
+      }
+      break;
+    }
+    case HloOpcode::kDynamicUpdateSlice: {
+      TF_RET_CHECK(proto.operand_ids_size() >= 2)
+          << "DynamicUpdateSlice instruction should have at least 2 operands "
+             "but sees "
+          << proto.operand_ids_size();
+      if (proto.operand_ids_size() == 3 && operands(2)->shape().rank() == 1) {
+        // TODO(b/118437727): Old form, remove this path.
+        instruction = CreateDynamicUpdateSlice(shape, operands(0), operands(1),
+                                               operands(2));
+      } else {
+        // New form
+        auto expected_operands = 2 + operands(0)->shape().rank();
+        TF_RET_CHECK(proto.operand_ids_size() == expected_operands)
+            << "DynamicUpdateSlice instruction should have "
+            << expected_operands << " operands, but has "
+            << proto.operand_ids_size();
+        const auto& operand_vector = all_operands();
+        instruction =
+            CreateDynamicUpdateSlice(shape, operands(0), operands(1),
+                                     absl::MakeSpan(operand_vector).subspan(2));
+      }
       break;
     }
     case HloOpcode::kGather: {
@@ -917,6 +954,14 @@ HloInstruction::CreateAddDependency(HloInstruction* data_operand,
       shape, operand, start_indices, slice_sizes);
 }
 
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateDynamicSlice(
+    const Shape& shape, HloInstruction* operand,
+    absl::Span<HloInstruction* const> start_indices,
+    absl::Span<const int64> slice_sizes) {
+  return absl::make_unique<HloDynamicSliceInstruction>(
+      shape, operand, start_indices, slice_sizes);
+}
+
 /* static */ std::unique_ptr<HloInstruction>
 HloInstruction::CreateDynamicUpdateSlice(const Shape& shape,
                                          HloInstruction* operand,
@@ -926,6 +971,14 @@ HloInstruction::CreateDynamicUpdateSlice(const Shape& shape,
       shape, operand, update, start_indices);
 }
 
+/* static */ std::unique_ptr<HloInstruction>
+HloInstruction::CreateDynamicUpdateSlice(
+    const Shape& shape, HloInstruction* operand, HloInstruction* update,
+    absl::Span<HloInstruction* const> start_indices) {
+  return absl::make_unique<HloDynamicUpdateSliceInstruction>(
+      shape, operand, update, start_indices);
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateConcatenate(
     const Shape& shape, absl::Span<HloInstruction* const> operands,
     int64 dimension) {
@@ -1382,9 +1435,8 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       clone = CreateReshape(shape, new_operands[0]);
       break;
     case HloOpcode::kDynamicUpdateSlice:
-      CHECK_EQ(new_operands.size(), 3);
       clone = CreateDynamicUpdateSlice(shape, new_operands[0], new_operands[1],
-                                       new_operands[2]);
+                                       new_operands.subspan(2));
       break;
     case HloOpcode::kTuple:
       clone = CreateTuple(new_operands);
@@ -1546,12 +1598,10 @@ HloInstruction::InstructionVector HloInstruction::unique_operands() const {
 
 Status HloInstruction::AddControlDependencyTo(HloInstruction* instruction) {
   TF_RET_CHECK(instruction->parent() == parent());
-  if (std::find(control_successors_.begin(), control_successors_.end(),
-                instruction) == control_successors_.end()) {
+  if (!absl::c_linear_search(control_successors_, instruction)) {
     control_successors_.push_back(instruction);
-    TF_RET_CHECK(std::find(instruction->control_predecessors_.begin(),
-                           instruction->control_predecessors_.end(),
-                           this) == instruction->control_predecessors_.end());
+    TF_RET_CHECK(
+        !absl::c_linear_search(instruction->control_predecessors_, this));
     instruction->control_predecessors_.push_back(this);
   }
   return Status::OK();
@@ -1800,7 +1850,7 @@ void HloInstruction::RemoveUser(HloInstruction* user) {
   user_set_.erase(set_it);
   // This is linear in the number of the users, but a vector provides a stable
   // iteration order and much faster traversal.
-  auto vec_it = std::find(users_.begin(), users_.end(), user);
+  auto vec_it = absl::c_find(users_, user);
   CHECK(vec_it != users_.end());
   users_.erase(vec_it);
 }
@@ -1818,8 +1868,7 @@ Status HloInstruction::ReplaceUseWith(HloInstruction* user,
 
   RemoveUser(user);
 
-  TF_RET_CHECK(
-      std::count(user->operands_.begin(), user->operands_.end(), this) >= 0);
+  TF_RET_CHECK(absl::c_count(user->operands_, this) >= 0);
   std::replace(user->operands_.begin(), user->operands_.end(), this,
                new_producer);
   new_producer->AddUser(user);
@@ -1832,6 +1881,16 @@ Status HloInstruction::ReplaceUseWith(HloInstruction* user,
 
 Status HloInstruction::ReplaceOperandWith(int64 operand_num,
                                           HloInstruction* new_operand) {
+  auto old_operand = operand(operand_num);
+  TF_RET_CHECK(ShapeUtil::CompatibleIgnoringFpPrecision(old_operand->shape(),
+                                                        new_operand->shape()))
+      << old_operand->shape() << " is not compatible with "
+      << new_operand->shape();
+  return ReplaceOperandWithDifferentShape(operand_num, new_operand);
+}
+
+Status HloInstruction::ReplaceOperandWithDifferentShape(
+    int64 operand_num, HloInstruction* new_operand) {
   TF_RET_CHECK(operand_num >= 0);
   TF_RET_CHECK(operand_num < operand_count());
   HloInstruction* old_operand = mutable_operand(operand_num);
@@ -1839,17 +1898,12 @@ Status HloInstruction::ReplaceOperandWith(int64 operand_num,
     return Status::OK();
   }
 
-  TF_RET_CHECK(ShapeUtil::CompatibleIgnoringFpPrecision(old_operand->shape(),
-                                                        new_operand->shape()))
-      << old_operand->shape() << " is not compatible with "
-      << new_operand->shape();
   operands_[operand_num] = new_operand;
 
   VLOG(3) << "Replacing operand " << operand_num << " of " << name() << " with "
           << new_operand->name() << ", was " << old_operand->name();
 
-  if (std::find(operands_.begin(), operands_.end(), old_operand) ==
-      operands_.end()) {
+  if (!absl::c_linear_search(operands_, old_operand)) {
     old_operand->RemoveUser(this);
   }
   new_operand->AddUser(this);
@@ -1857,6 +1911,14 @@ Status HloInstruction::ReplaceOperandWith(int64 operand_num,
 }
 
 Status HloInstruction::ReplaceAllUsesWith(HloInstruction* new_producer) {
+  TF_RET_CHECK(
+      ShapeUtil::CompatibleIgnoringFpPrecision(shape(), new_producer->shape()))
+      << shape() << " is not compatible with " << new_producer->shape();
+  return ReplaceAllUsesWithDifferentShape(new_producer);
+}
+
+Status HloInstruction::ReplaceAllUsesWithDifferentShape(
+    HloInstruction* new_producer) {
   bool new_producer_is_user = false;
   for (HloInstruction* user : users()) {
     if (user == new_producer) {
@@ -1881,7 +1943,8 @@ Status HloInstruction::ReplaceAllUsesWith(HloInstruction* new_producer) {
     AddUser(new_producer);
   }
   if (parent_ && parent_->root_instruction() == this) {
-    parent_->set_root_instruction(new_producer);
+    parent_->set_root_instruction(new_producer,
+                                  /*accept_different_shape=*/true);
   }
 
   return Status::OK();
@@ -2824,7 +2887,7 @@ HloInstruction::UseKind HloInstruction::OperandElementUse(int64 i) const {
       }
       return UseKind::kReuse;
     case HloOpcode::kDynamicUpdateSlice:
-      // Dynamic-update-slice reuses only operand 2 (start_indices).
+      // Dynamic-update-slice reuses only start_indices.
       if (i == 0 || i == 1) {
         return UseKind::kUse;
       }
@@ -2877,10 +2940,10 @@ StatusOr<HloInstruction::FusionKind> StringToFusionKind(
 
 string PaddingConfigToString(const PaddingConfig& padding) {
   bool has_interior_padding =
-      std::any_of(padding.dimensions().begin(), padding.dimensions().end(),
-                  [](const PaddingConfig::PaddingConfigDimension& dim) {
-                    return dim.interior_padding() != 0;
-                  });
+      absl::c_any_of(padding.dimensions(),
+                     [](const PaddingConfig::PaddingConfigDimension& dim) {
+                       return dim.interior_padding() != 0;
+                     });
   return StrJoin(
       padding.dimensions(), "x",
       [&](string* out, const PaddingConfig::PaddingConfigDimension& dim) {
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 36e1ab49319a3e28143ef4d08888c68c86fbcf62..789ace6a263b37e08858b534028c26554105eb30 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -556,15 +556,26 @@ class HloInstruction {
   // Creates a slice instruction, where the first operand is sliced by
   // start indices specified in the second operand, and by size specified in
   // 'slice_sizes'.
+  ABSL_DEPRECATED("Use span-of-indices form instead")
   static std::unique_ptr<HloInstruction> CreateDynamicSlice(
       const Shape& shape, HloInstruction* operand,
       HloInstruction* start_indices, absl::Span<const int64> slice_sizes);
+  // Same as above, but expects a span of scalar start indices.
+  static std::unique_ptr<HloInstruction> CreateDynamicSlice(
+      const Shape& shape, HloInstruction* operand,
+      absl::Span<HloInstruction* const> start_indices,
+      absl::Span<const int64> slice_sizes);
 
   // Creates a dynamic update slice instruction, which updates a slice
   // of 'operand' with 'update' and 'start_indices'.
+  ABSL_DEPRECATED("Use span-of-indices form instead")
   static std::unique_ptr<HloInstruction> CreateDynamicUpdateSlice(
       const Shape& shape, HloInstruction* operand, HloInstruction* update,
       HloInstruction* start_indices);
+  // Same as above, but expects a span of scalar start indices.
+  static std::unique_ptr<HloInstruction> CreateDynamicUpdateSlice(
+      const Shape& shape, HloInstruction* operand, HloInstruction* update,
+      absl::Span<HloInstruction* const> start_indices);
 
   // Creates a concatenate instruction, where the operands are concatenated on
   // the provided dimension.
@@ -928,11 +939,16 @@ class HloInstruction {
   // operands of it which could be created due to this replacement.
   Status ReplaceUseWith(HloInstruction* user, HloInstruction* new_producer);
 
-  // Replaces the specified operand with new_operand.
+  // Replaces the specified operand with new_operand. The old and new operands
+  // must have compatible shapes ignoring floating-point precision.
   //
   // This function does NOT remove duplicated operands even if this instruction
   // is a fusion, so that the existing operand numbers do not change.
-  Status ReplaceOperandWith(int64 operand_no, HloInstruction* new_operand);
+  Status ReplaceOperandWith(int64 operand_num, HloInstruction* new_operand);
+
+  // Same as ReplaceOperandWith(), but new_operand can have a different shape.
+  Status ReplaceOperandWithDifferentShape(int64 operand_num,
+                                          HloInstruction* new_operand);
 
   // Replaces all uses of this instruction with the new producer. If
   // new_producer is a user of this instruction then new_producer remains a use
@@ -941,10 +957,16 @@ class HloInstruction {
   // If this instruction is the root of its computation, sets the computation's
   // root to new_producer.
   //
+  // The new producer must have a compatible shape ignoring floating-point
+  // precision.
+  //
   // If a user is a fusion instruction, this function will remove any duplicated
   // operands of it which could be created due to this replacement.
   Status ReplaceAllUsesWith(HloInstruction* new_producer);
 
+  // Same as ReplaceAllUsesWith, but new_producer can have a different shape.
+  Status ReplaceAllUsesWithDifferentShape(HloInstruction* new_producer);
+
   // Performs a postorder DFS visit using this node as the root. If
   // call_finish_visit is true, then DfsHloVisitor::FinishVisit is called when
   // complete. If ignore_control_predecessors is true, instructions only
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index 8048e332cb57747286758b75773b29ba154aa888..35f031f29a7aca8db7ebe2fbcfdcebb7a778d703 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
@@ -55,13 +56,13 @@ class OpAndUserCollectingVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleParameter(HloInstruction* parameter) override {
-    EXPECT_EQ(0, count_.count(parameter));
+    EXPECT_FALSE(count_.contains(parameter));
     count_[parameter] = GetCountsForNode(parameter);
     return Status::OK();
   }
 
   Status HandleConstant(HloInstruction* constant) override {
-    EXPECT_EQ(0, count_.count(constant));
+    EXPECT_FALSE(count_.contains(constant));
     count_[constant] = GetCountsForNode(constant);
     return Status::OK();
   }
@@ -69,25 +70,25 @@ class OpAndUserCollectingVisitor : public DfsHloVisitorWithDefault {
   Status HandleAdd(HloInstruction* add) override {
     auto lhs = add->operand(0);
     auto rhs = add->operand(1);
-    EXPECT_EQ(0, count_.count(add));
-    EXPECT_GT(count_.count(lhs), 0);
-    EXPECT_GT(count_.count(rhs), 0);
+    EXPECT_FALSE(count_.contains(add));
+    EXPECT_TRUE(count_.contains(lhs));
+    EXPECT_TRUE(count_.contains(rhs));
     count_[add] = GetCountsForNode(add);
     return Status::OK();
   }
 
   Status HandleNegate(HloInstruction* negate) override {
     auto operand = negate->operand(0);
-    EXPECT_EQ(0, count_.count(negate));
-    EXPECT_GT(count_.count(operand), 0);
+    EXPECT_FALSE(count_.contains(negate));
+    EXPECT_TRUE(count_.contains(operand));
     count_[negate] = GetCountsForNode(negate);
     return Status::OK();
   }
 
   Status HandleMap(HloInstruction* map) override {
-    EXPECT_EQ(0, count_.count(map));
+    EXPECT_FALSE(count_.contains(map));
     for (HloInstruction* arg : map->operands()) {
-      EXPECT_GT(count_.count(arg), 0);
+      EXPECT_TRUE(count_.contains(arg));
     }
     count_[map] = GetCountsForNode(map);
     return Status::OK();
@@ -96,9 +97,9 @@ class OpAndUserCollectingVisitor : public DfsHloVisitorWithDefault {
   Status HandleReduce(HloInstruction* reduce) override {
     auto arg = reduce->operand(0);
     auto init_value = reduce->operand(1);
-    EXPECT_EQ(0, count_.count(reduce));
-    EXPECT_GT(count_.count(arg), 0);
-    EXPECT_GT(count_.count(init_value), 0);
+    EXPECT_FALSE(count_.contains(reduce));
+    EXPECT_TRUE(count_.contains(arg));
+    EXPECT_TRUE(count_.contains(init_value));
     count_[reduce] = GetCountsForNode(reduce);
     return Status::OK();
   }
@@ -128,7 +129,7 @@ class OpAndUserCollectingVisitor : public DfsHloVisitorWithDefault {
   }
 
   // Counters for HLOs. Maps HLO to a NumOpsAndUsers.
-  std::unordered_map<const HloInstruction*, NumOpsAndUsers> count_;
+  absl::flat_hash_map<const HloInstruction*, NumOpsAndUsers> count_;
 };
 
 TEST_F(HloInstructionTest, BasicProperties) {
@@ -137,7 +138,7 @@ TEST_F(HloInstructionTest, BasicProperties) {
   EXPECT_EQ(HloOpcode::kParameter, parameter->opcode());
   EXPECT_TRUE(ShapeUtil::IsScalarWithElementType(parameter->shape(), F32));
   EXPECT_FALSE(ShapeUtil::IsScalarWithElementType(parameter->shape(), S32));
-  EXPECT_EQ(0, parameter->operand_count());
+  EXPECT_FALSE(parameter->operand_count());
 }
 
 TEST_F(HloInstructionTest, UserWithTwoOperands) {
@@ -981,9 +982,9 @@ TEST_F(HloInstructionTest, FunctionVisitor) {
   module->AddEntryComputation(builder.Build());
 
   int visit_num = 0;
-  std::unordered_map<HloInstruction*, int> visit_order;
+  absl::flat_hash_map<HloInstruction*, int> visit_order;
   EXPECT_IS_OK(add->Accept([&visit_num, &visit_order](HloInstruction* inst) {
-    EXPECT_EQ(0, visit_order.count(inst));
+    EXPECT_FALSE(visit_order.contains(inst));
     visit_order[inst] = visit_num;
     visit_num++;
     return Status::OK();
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index 1f37b284a2606f5256a02510a93bcec8664b0eb6..c24bf41ff8e0853d97f512be98603133c0d4f63d 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -42,11 +42,9 @@ using absl::StrJoin;
 bool IsInstructionElementwiseOnOperand(const HloInstruction* instruction,
                                        const HloInstruction* operand) {
   std::vector<int64> operand_indices = instruction->OperandIndices(operand);
-  return std::all_of(
-      operand_indices.begin(), operand_indices.end(),
-      [instruction](int64 operand_index) {
-        return instruction->IsElementwiseOnOperand(operand_index);
-      });
+  return absl::c_all_of(operand_indices, [instruction](int64 operand_index) {
+    return instruction->IsElementwiseOnOperand(operand_index);
+  });
 }
 
 string PrecisionConfigToString(const PrecisionConfig& precision_config) {
@@ -814,8 +812,7 @@ std::vector<string> HloSliceInstruction::ExtraAttributesToStringImpl(
   std::vector<string> bounds;
   bounds.reserve(slice_starts_.size());
   const bool omit_stride =
-      std::all_of(slice_strides_.begin(), slice_strides_.end(),
-                  [](int64 stride) { return stride == 1; });
+      absl::c_all_of(slice_strides_, [](int64 stride) { return stride == 1; });
   for (int i = 0; i < slice_starts_.size(); ++i) {
     string stride_str = omit_stride ? "" : StrCat(":", slice_strides_[i]);
     bounds.push_back(
@@ -1051,8 +1048,7 @@ HloInstruction* HloFusionInstruction::AddFusionOperand(
 
 void HloFusionInstruction::MergeFusionInstruction(
     HloFusionInstruction* instruction_to_merge) {
-  CHECK(std::find(operands().begin(), operands().end(), instruction_to_merge) !=
-        operands().end());
+  CHECK(absl::c_linear_search(operands(), instruction_to_merge));
   // Clone the instruction from which to merge fused instructions.
   std::unique_ptr<HloInstruction> cloned = instruction_to_merge->Clone();
   HloFusionInstruction* cloned_fusion =
@@ -1219,8 +1215,8 @@ HloInstruction* HloFusionInstruction::CloneAndFuseInternal(
     // corresponding fused parameter instruction. Renumber parameters as
     // necessary to make parameter numbers consistent with their index in the
     // fused_parameter_ vector.
-    bool in_operand_list = std::find(operands().begin(), operands().end(),
-                                     instruction_to_fuse) != operands().end();
+    bool in_operand_list =
+        absl::c_linear_search(operands(), instruction_to_fuse);
     CHECK(add_output || in_operand_list);
     if (instruction_to_fuse->opcode() == HloOpcode::kTuple) {
       // We assume all uses of a kTuple operation are GTE ops, not another
@@ -1324,7 +1320,7 @@ HloInstruction* HloFusionInstruction::CloneAndFuseInternal(
     if (newly_created_tuple_instr) {
       HloInstruction* new_instr = parent()->AddInstruction(
           HloInstruction::CreateGetTupleElement(fused_root->shape(), this, 0));
-      TF_CHECK_OK(ReplaceAllUsesWith(new_instr));
+      TF_CHECK_OK(ReplaceAllUsesWithDifferentShape(new_instr));
     }
     int64 index = tuple_elements.size();
     if (instruction_to_fuse->opcode() == HloOpcode::kTuple) {
@@ -2007,6 +2003,18 @@ HloDynamicSliceInstruction::HloDynamicSliceInstruction(
   AppendOperand(start_indices);
 }
 
+HloDynamicSliceInstruction::HloDynamicSliceInstruction(
+    const Shape& shape, HloInstruction* operand,
+    absl::Span<HloInstruction* const> start_indices,
+    absl::Span<const int64> slice_sizes)
+    : HloDynamicIndexInstruction(HloOpcode::kDynamicSlice, shape),
+      dynamic_slice_sizes_(slice_sizes.begin(), slice_sizes.end()) {
+  AppendOperand(operand);
+  for (HloInstruction* index : start_indices) {
+    AppendOperand(index);
+  }
+}
+
 HloDynamicUpdateSliceInstruction::HloDynamicUpdateSliceInstruction(
     const Shape& shape, HloInstruction* operand, HloInstruction* update,
     HloInstruction* start_indices)
@@ -2016,6 +2024,17 @@ HloDynamicUpdateSliceInstruction::HloDynamicUpdateSliceInstruction(
   AppendOperand(start_indices);
 }
 
+HloDynamicUpdateSliceInstruction::HloDynamicUpdateSliceInstruction(
+    const Shape& shape, HloInstruction* operand, HloInstruction* update,
+    absl::Span<HloInstruction* const> start_indices)
+    : HloDynamicIndexInstruction(HloOpcode::kDynamicUpdateSlice, shape) {
+  AppendOperand(operand);
+  AppendOperand(update);
+  for (HloInstruction* index : start_indices) {
+    AppendOperand(index);
+  }
+}
+
 HloInstructionProto HloDynamicSliceInstruction::ToProto() const {
   HloInstructionProto proto = HloInstruction::ToProto();
   for (int64 slice_size : dynamic_slice_sizes_) {
@@ -2041,9 +2060,14 @@ std::unique_ptr<HloInstruction>
 HloDynamicSliceInstruction::CloneWithNewOperandsImpl(
     const Shape& shape, absl::Span<HloInstruction* const> new_operands,
     HloCloneContext* context) const {
-  CHECK_EQ(new_operands.size(), 2);
-  return absl::make_unique<HloDynamicSliceInstruction>(
-      shape, new_operands[0], new_operands[1], dynamic_slice_sizes_);
+  if (new_operands.size() == 2 && new_operands[1]->shape().rank() == 1) {
+    // TODO(b/118437727): Old form, remove this path.
+    return absl::make_unique<HloDynamicSliceInstruction>(
+        shape, new_operands[0], new_operands[1], dynamic_slice_sizes_);
+  } else {
+    return absl::make_unique<HloDynamicSliceInstruction>(
+        shape, new_operands[0], new_operands.subspan(1), dynamic_slice_sizes_);
+  }
 }
 
 HloGatherInstruction::HloGatherInstruction(
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index ca212c7f2c98f75ceefc14b7fbc2a1f530c06cf7..e6111cfb57581589070b8e34556bdfe8239b4fd3 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -1183,7 +1183,22 @@ class HloDynamicIndexInstruction : public HloInstruction {
  public:
   explicit HloDynamicIndexInstruction(HloOpcode opcode, const Shape& shape)
       : HloInstruction(opcode, shape) {}
-  virtual int64 index_operand_number() const = 0;
+  virtual int64 first_index_operand_number() const = 0;
+
+  // Returns a subspan of operands which represent the start indices.
+  absl::Span<HloInstruction* const> index_operands() const {
+    return absl::MakeSpan(operands()).subspan(first_index_operand_number());
+  }
+
+  // Returns the shapes of the index operands.
+  std::vector<Shape> index_shapes() const {
+    std::vector<Shape> shapes;
+    auto indices = index_operands();
+    for (const HloInstruction* index : indices) {
+      shapes.push_back(index->shape());
+    }
+    return shapes;
+  }
 };
 
 class HloDynamicSliceInstruction : public HloDynamicIndexInstruction {
@@ -1192,6 +1207,10 @@ class HloDynamicSliceInstruction : public HloDynamicIndexInstruction {
                                       HloInstruction* operand,
                                       HloInstruction* start_indices,
                                       absl::Span<const int64> slice_sizes);
+  explicit HloDynamicSliceInstruction(
+      const Shape& shape, HloInstruction* operand,
+      absl::Span<HloInstruction* const> start_indices,
+      absl::Span<const int64> slice_sizes);
   // Old methods kept for smooth subclassing transition END.
   // Returns the size of the slice in the given dimension for a dynamic
   // slice node.
@@ -1204,7 +1223,7 @@ class HloDynamicSliceInstruction : public HloDynamicIndexInstruction {
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
 
-  int64 index_operand_number() const override { return 1; }
+  int64 first_index_operand_number() const override { return 1; }
 
  private:
   std::vector<string> ExtraAttributesToStringImpl(
@@ -1229,8 +1248,11 @@ class HloDynamicUpdateSliceInstruction : public HloDynamicIndexInstruction {
                                             HloInstruction* operand,
                                             HloInstruction* update,
                                             HloInstruction* start_indices);
+  explicit HloDynamicUpdateSliceInstruction(
+      const Shape& shape, HloInstruction* operand, HloInstruction* update,
+      absl::Span<HloInstruction* const> start_indices);
 
-  int64 index_operand_number() const override { return 2; }
+  int64 first_index_operand_number() const override { return 2; }
 };
 
 class HloGatherInstruction : public HloInstruction {
diff --git a/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc b/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc
index 5bf055f3c012fef687cdc275d62efdf2d4cd5e5c..e14bcfa7f67e736a4d04f5b236fb2df02cf150e0 100644
--- a/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <deque>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/map_util.h"
@@ -36,11 +37,11 @@ namespace xla {
 namespace {
 
 using Worklist = std::deque<const HloInstruction*>;
-using Workset = std::unordered_set<const HloInstruction*>;
+using Workset = absl::flat_hash_set<const HloInstruction*>;
 
 void AddToWorklist(const HloInstruction* instruction, Worklist* worklist,
                    Workset* workset) {
-  if (workset->count(instruction) == 0) {
+  if (!workset->contains(instruction)) {
     worklist->push_back(instruction);
     workset->insert(instruction);
     VLOG(3) << "ADD instruction: " << instruction->name();
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index fe8371384c0fa3900a9022f101ff0b296439cf16..258f918f47a313b4b89fb260457b1b119dc16177 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -107,11 +107,10 @@ HloComputation* HloModule::AddEntryComputation(
 }
 
 Status HloModule::RemoveEmbeddedComputation(HloComputation* to_remove) {
-  auto it =
-      std::find_if(computations_.begin(), computations_.end(),
-                   [&to_remove](const std::unique_ptr<HloComputation>& comp) {
-                     return comp.get() == to_remove;
-                   });
+  auto it = absl::c_find_if(
+      computations_, [&to_remove](const std::unique_ptr<HloComputation>& comp) {
+        return comp.get() == to_remove;
+      });
   TF_RET_CHECK(it->get() == to_remove);
   computations_.erase(it);
   return Status::OK();
@@ -304,11 +303,10 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
   auto module = absl::make_unique<HloModule>(proto.name(), module_config);
 
   // Sort the computations in the proto id's order.
-  std::sort(computations.begin(), computations.end(),
-            [&](const std::unique_ptr<HloComputation>& a,
-                const std::unique_ptr<HloComputation>& b) {
-              return to_proto_id[a.get()] < to_proto_id[b.get()];
-            });
+  absl::c_sort(computations, [&](const std::unique_ptr<HloComputation>& a,
+                                 const std::unique_ptr<HloComputation>& b) {
+    return to_proto_id[a.get()] < to_proto_id[b.get()];
+  });
 
   // Add sorted computations to the module.
   for (auto& computation : computations) {
@@ -392,15 +390,12 @@ namespace {
 // Returns whether `hlo` is used outside the given subcomputation.
 // `instructions_in_subcomputation` is the instruction set of the given
 // subcomputation.
-bool IsUsedOutsideSubcomputation(
-    const HloInstruction& hlo,
-    const std::unordered_set<HloInstruction*>& instructions_in_subcomputation) {
-  for (HloInstruction* user : hlo.users()) {
-    if (!instructions_in_subcomputation.count(user)) {
-      return true;
-    }
-  }
-  return false;
+bool IsUsedOutsideSubcomputation(const HloInstruction& hlo,
+                                 const absl::flat_hash_set<HloInstruction*>&
+                                     instructions_in_subcomputation) {
+  return absl::c_any_of(hlo.users(), [&](HloInstruction* user) {
+    return !instructions_in_subcomputation.contains(user);
+  });
 }
 }  // anonymous namespace
 
@@ -411,9 +406,9 @@ HloInstruction* HloModule::OutlineExpressionFromComputation(
 
   // A map from original instructions to their counterparts in the new outlined
   // function.
-  std::unordered_map<HloInstruction*, HloInstruction*> outlined_instructions;
+  absl::flat_hash_map<HloInstruction*, HloInstruction*> outlined_instructions;
   // A set that contains all instructions to be outlined.
-  std::unordered_set<HloInstruction*> instruction_set_to_outline(
+  absl::flat_hash_set<HloInstruction*> instruction_set_to_outline(
       instructions_to_outline.begin(), instructions_to_outline.end());
   std::vector<HloInstruction*> arguments;
   std::vector<HloInstruction*> outputs;
@@ -502,7 +497,7 @@ std::vector<HloComputation*> HloModule::MakeComputationPostOrder() const {
   // First determine all root computations by building a set of nonroot
   // computations (computations which are called by an instruction in the
   // module).
-  std::set<HloComputation*> nonroot_computations;
+  absl::flat_hash_set<HloComputation*> nonroot_computations;
   for (auto& computation : computations_) {
     for (auto* instruction : computation->instructions()) {
       for (HloComputation* called_computation :
@@ -515,19 +510,19 @@ std::vector<HloComputation*> HloModule::MakeComputationPostOrder() const {
   // Keep track of computations which have already been added to the post
   // order. This prevents duplication as an embedded computation may be called
   // from two different root computations.
-  std::set<HloComputation*> added_computations;
+  absl::flat_hash_set<HloComputation*> added_computations;
   std::vector<HloComputation*> post_order;
   for (auto& computation : computations_) {
-    if (nonroot_computations.count(computation.get()) == 0) {
+    if (!nonroot_computations.contains(computation.get())) {
       for (HloComputation* embedded_computation :
            computation->MakeEmbeddedComputationsList()) {
-        if (added_computations.count(embedded_computation) == 0) {
+        if (!added_computations.contains(embedded_computation)) {
           post_order.push_back(embedded_computation);
           added_computations.insert(embedded_computation);
         }
       }
       // Root computations should only be encountered once.
-      CHECK_EQ(0, added_computations.count(computation.get()));
+      CHECK(!added_computations.contains(computation.get()));
       post_order.push_back(computation.get());
       added_computations.insert(computation.get());
     }
diff --git a/tensorflow/compiler/xla/service/hlo_module_dce_test.cc b/tensorflow/compiler/xla/service/hlo_module_dce_test.cc
index e535b7d74943943069b4d795cf999a3b1e963360..f6e2866204955ac024c2b6f972de449cc3df4c15 100644
--- a/tensorflow/compiler/xla/service/hlo_module_dce_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_dce_test.cc
@@ -38,9 +38,7 @@ class HloModuleDceTest : public HloTestBase {
   // Returns whether the given instruction exists in the given computation.
   bool HasInstruction(const HloComputation& computation,
                       const HloInstruction* instruction) {
-    return std::find(computation.instructions().begin(),
-                     computation.instructions().end(),
-                     instruction) != computation.instructions().end();
+    return absl::c_linear_search(computation.instructions(), instruction);
   }
 
   // Returns whether the while instruction with name 'while_name' in
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
index 80f8ca2226b601d80af70395e485eb73246ddacb..47734bc55cc00d605f4e318400be88639450343c 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
@@ -199,7 +199,7 @@ bool HloModuleGroupMetadata::IsChannelInstruction(
 }
 
 bool HloModuleGroupMetadata::IsCompanionInstruction(HloInstruction* hlo) const {
-  return companion_set_index_.count(hlo) > 0;
+  return companion_set_index_.contains(hlo);
 }
 
 bool HloModuleGroupMetadata::InstructionCommunicates(
@@ -510,7 +510,7 @@ Status HloModuleGroupMetadata::CheckCommunicatingInstruction(
   HloComputation* computation = instruction->parent();
   const HloModule* module = computation->parent();
   if (module->entry_computation() == computation ||
-      tracked_instructions_.count(computation) > 0) {
+      tracked_instructions_.contains(computation)) {
     return Status::OK();
   }
   return FailedPrecondition("channel is used in disallowed computation");
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
index 5cd0e38f3883a325dffe586f6ec46be9f6b799ab..3ed95c10504141139d83eb8679a0b8144b15ad0d 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
@@ -178,7 +178,7 @@ class HloModuleGroupMetadata {
   // Precondition: IsCompanionWhile(instruction) is true.
   const std::vector<HloInstruction*>& Companions(
       const HloInstruction* instruction) const {
-    CHECK_EQ(companion_set_index_.count(instruction), 1);
+    CHECK(companion_set_index_.contains(instruction));
     return companion_set(companion_set_index_.at(instruction));
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc
index ca6a154809be46d6a0305c29e2b89219de408019..0cec61c257bb84e467290fb52ec9063a32ed558d 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering.cc
@@ -367,7 +367,7 @@ bool SequentialHloOrdering::ExecutesBeforeInSameComputation(
     const HloInstruction* a, const HloInstruction* b) const {
   CHECK_EQ(a->parent(), b->parent());
   // If either instruction is not in the order, then 'a' and 'b' are unordered.
-  if (order_position_.count(a) == 0 || order_position_.count(b) == 0) {
+  if (!order_position_.contains(a) || !order_position_.contains(b)) {
     return false;
   }
   return order_position_.at(a) < order_position_.at(b);
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index 56848ce0e8558132d9706c5134db6a4dcef6bdfe..638396308c2a9c1f20e47f78b594d54f07c0c4e5 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -1171,24 +1171,39 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       optional<std::vector<tensorflow::int64>> dynamic_slice_sizes;
       attrs["dynamic_slice_sizes"] = {
           /*required=*/true, AttrTy::kBracedInt64List, &dynamic_slice_sizes};
-      if (!ParseOperands(&operands, /*expected_size=*/2) ||
-          !ParseAttributes(attrs)) {
+      LocTy loc = lexer_.GetLoc();
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
         return false;
       }
+      if (operands.empty()) {
+        return Error(loc, "Expected at least one operand.");
+      }
+      if (!(operands.size() == 2 && operands[1]->shape().rank() == 1) &&
+          operands.size() != 1 + operands[0]->shape().rank()) {
+        return Error(loc, "Wrong number of operands.");
+      }
       instruction = builder->AddInstruction(HloInstruction::CreateDynamicSlice(
-          shape, /*operand=*/operands[0], /*start_indices=*/operands[1],
+          shape, /*operand=*/operands[0],
+          /*start_indices=*/absl::MakeSpan(operands).subspan(1),
           *dynamic_slice_sizes));
       break;
     }
     case HloOpcode::kDynamicUpdateSlice: {
-      if (!ParseOperands(&operands, /*expected_size=*/3) ||
-          !ParseAttributes(attrs)) {
+      LocTy loc = lexer_.GetLoc();
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
         return false;
       }
+      if (operands.size() < 2) {
+        return Error(loc, "Expected at least two operands.");
+      }
+      if (!(operands.size() == 3 && operands[2]->shape().rank() == 1) &&
+          operands.size() != 2 + operands[0]->shape().rank()) {
+        return Error(loc, "Wrong number of operands.");
+      }
       instruction =
           builder->AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
               shape, /*operand=*/operands[0], /*update=*/operands[1],
-              /*start_indices=*/operands[2]));
+              /*start_indices=*/absl::MakeSpan(operands).subspan(2)));
       break;
     }
     case HloOpcode::kTranspose: {
@@ -2731,7 +2746,7 @@ bool HloParser::ParseConvolutionDimensionNumbers(
   }
 
   auto is_unique = [](string str) -> bool {
-    std::sort(str.begin(), str.end());
+    absl::c_sort(str);
     return std::unique(str.begin(), str.end()) == str.end();
   };
 
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index bc1a736766abc2fe0746b34b97fd0a5a17de462d..76b8a5bc117e653b3b2c09c7c95f26f1d4f27c7b 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -566,12 +566,26 @@ ENTRY %DynamicSlice.v5 (original_parameter: s32[2,2,258], start_index: s32[1]) -
   ROOT %dynamic-slice = s32[2,2,258]{2,1,0} dynamic-slice(s32[2,2,258]{2,1,0} %original_parameter, s32[3]{0} %concatenate), dynamic_slice_sizes={2,2,258}
 }
 
+)"
+},
+// Dynamic slice with scalar indices
+{
+"DynamicSliceScalarIndices",
+R"(HloModule DynamicSlice_module
+
+ENTRY %DynamicSlice.v5 (original_parameter: s32[2,2,258], start_index: s32[]) -> s32[2,2,258] {
+  %original_parameter = s32[2,2,258]{2,1,0} parameter(0)
+  %constant = s32[] constant(0)
+  %start_index = s32[] parameter(1)
+  ROOT %dynamic-slice = s32[2,2,258]{2,1,0} dynamic-slice(s32[2,2,258]{2,1,0} %original_parameter, s32[] %constant, s32[] %constant, s32[] %start_index), dynamic_slice_sizes={2,2,258}
+}
+
 )"
 },
 // Dynamic update slice
 {
 "DynamicUpdateSlice",
-R"(HloModule DynamicUpdateSlice_module
+R"(HloModule DynamicSlice_module
 
 ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_indices: s32[4]) -> s32[1,1,25,1] {
   %input = s32[1,1,25,1]{3,2,1,0} parameter(0)
@@ -580,6 +594,23 @@ ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_
   ROOT %dynamic-update-slice = s32[1,1,25,1]{3,2,1,0} dynamic-update-slice(s32[1,1,25,1]{3,2,1,0} %input, s32[1,1,2,1]{3,2,1,0} %update, s32[4]{0} %start_indices)
 }
 
+)"
+},
+// Dynamic update slice with scalar indices
+{
+"DynamicUpdateSliceScalarIndex",
+R"(HloModule DynamicUpdateSlice_module
+
+ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_index.0: s32[], start_index.1: s32[], start_index.2: s32[], start_index.3: s32[]) -> s32[1,1,25,1] {
+  %input = s32[1,1,25,1]{3,2,1,0} parameter(0)
+  %update = s32[1,1,2,1]{3,2,1,0} parameter(1)
+  %start_index.0 = s32[] parameter(2)
+  %start_index.1 = s32[] parameter(3)
+  %start_index.2 = s32[] parameter(4)
+  %start_index.3 = s32[] parameter(5)
+  ROOT %dynamic-update-slice = s32[1,1,25,1]{3,2,1,0} dynamic-update-slice(s32[1,1,25,1]{3,2,1,0} %input, s32[1,1,2,1]{3,2,1,0} %update, s32[] %start_index.0, s32[] %start_index.1, s32[] %start_index.2, s32[] %start_index.3)
+}
+
 )"
 },
 // batch norm training
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
index 33ce7e23a82d840676bba5f1ca9c0ffc4433465d..ae8c08cf1d16ad6738962f3be7c1b5512110b1d1 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
@@ -89,7 +89,7 @@ std::vector<HloPassInterface*> HloPassPipeline::GetEnabledPasses(
 
   std::vector<HloPassInterface*> enabled_passes;
   for (auto& pass : passes_) {
-    if (disabled_pass_names.count(string(pass->name())) == 0) {
+    if (!disabled_pass_names.contains(pass->name())) {
       enabled_passes.push_back(pass.get());
     }
   }
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer.cc b/tensorflow/compiler/xla/service/hlo_profile_printer.cc
index 5eb707a957e49d86cdb2f72b72ce750bf29b8fd2..9cc202aa9f5fe5a20a9da05251ea811137ccaadb 100644
--- a/tensorflow/compiler/xla/service/hlo_profile_printer.cc
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_profile_printer.h"
 
+#include "absl/algorithm/container.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/human_readable_profile_builder.h"
 
@@ -34,11 +35,10 @@ string PrintHloProfile(const HloProfilePrinterData& hlo_profile_printer_data,
   for (const HloComputationInfo& computation_info :
        hlo_profile_printer_data.computation_infos()) {
     const auto& instruction_infos = computation_info.instruction_infos();
-    bool any_instruction_profiled =
-        std::any_of(instruction_infos.begin(), instruction_infos.end(),
-                    [&](const HloInstructionInfo& instruction_info) {
-                      return counters[instruction_info.profile_index()] != 0;
-                    });
+    bool any_instruction_profiled = absl::c_any_of(
+        instruction_infos, [&](const HloInstructionInfo& instruction_info) {
+          return counters[instruction_info.profile_index()] != 0;
+        });
 
     if (!any_instruction_profiled) {
       continue;
diff --git a/tensorflow/compiler/xla/service/hlo_reachability.cc b/tensorflow/compiler/xla/service/hlo_reachability.cc
index edaa4c59e2674e5f165c468059747d3dd2d54218..0fced7f15bdaf1dbe349e3b0fc6ada68393c6512 100644
--- a/tensorflow/compiler/xla/service/hlo_reachability.cc
+++ b/tensorflow/compiler/xla/service/hlo_reachability.cc
@@ -49,7 +49,7 @@ void HloReachabilityMap::SetReachabilityToUnionHelper(
     absl::Span<const HloInstruction* const> inputs,
     const HloInstruction* instruction, BitVector* bit_vector) {
   // If instruction is part of inputs, don't reset the bit_vector.
-  if (std::find(inputs.begin(), inputs.end(), instruction) == inputs.end()) {
+  if (!absl::c_linear_search(inputs, instruction)) {
     bit_vector->SetToZero();
   }
   bit_vector->Set(GetIndex(instruction));
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index ac74e2432f2176e13eaf7d4a1934a50ee89d1042..9ca14ca18a1c47f3975cfdb57a03f3b6f03379df 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -235,8 +235,7 @@ class InstructionList {
     }
 
     // Now scan forwards until we find one of the before_instructions.
-    while (std::find(before_instructions.begin(), before_instructions.end(),
-                     min_position_item) == before_instructions.end()) {
+    while (!absl::c_linear_search(before_instructions, min_position_item)) {
       min_position_item = min_position_item->next;
     }
     return InsertBefore(to_insert, min_position_item);
@@ -302,7 +301,7 @@ ItemList GetUsers(const InstructionList& instruction_list,
       // A buffer may be used by the instruction via more than one alias. For
       // example, a buffer which appears in more than one element of a tuple.
       Item* user_item = instruction_list.GetItem(user);
-      if (std::find(users.begin(), users.end(), user_item) == users.end()) {
+      if (!absl::c_linear_search(users, user_item)) {
         users.push_back(user_item);
       }
     }
@@ -456,8 +455,7 @@ class MemoryUsageTracker {
       return false;
     }
     const BufferIdList& in_progress_uses = in_progress_item_->buffers_used;
-    return std::find(in_progress_uses.begin(), in_progress_uses.end(),
-                     buffer_id) != in_progress_uses.end();
+    return absl::c_linear_search(in_progress_uses, buffer_id);
   }
 
   // Returns whether the given instruction is live at the current program
@@ -535,8 +533,7 @@ MemoryUsageTracker::MemoryUsageTracker(
         bool unused;
         for (Item* user_item : GetUsers(instruction_list_, logical_buffer,
                                         points_to_analysis, &unused)) {
-          if (std::find(buffer->users.begin(), buffer->users.end(),
-                        user_item) == buffer->users.end()) {
+          if (!absl::c_linear_search(buffer->users, user_item)) {
             buffer->users.push_back(user_item);
             buffer->unfinished_user_count++;
             user_item->buffers_used.push_back(buffer->id);
@@ -784,8 +781,7 @@ bool MemoryUsageTracker::Check() const {
 
     for (const Buffer& buffer : buffers_) {
       if (buffer.defining_instruction->instruction == instruction) {
-        CHECK(std::find(defined_buffers.begin(), defined_buffers.end(),
-                        buffer.id) != defined_buffers.end())
+        CHECK(absl::c_linear_search(defined_buffers, buffer.id))
             << "Instruction " << instruction->name()
             << " defined buffers is missing: " << buffer.ToString();
       }
@@ -808,8 +804,7 @@ bool MemoryUsageTracker::Check() const {
     int64 unfinished_uses = 0;
     for (Item* user : buffer.users) {
       const BufferIdList& used_buffers = user->buffers_used;
-      CHECK(std::find(used_buffers.begin(), used_buffers.end(), buffer.id) !=
-            used_buffers.end())
+      CHECK(absl::c_linear_search(used_buffers, buffer.id))
           << "Instruction " << user->instruction->name()
           << " used buffers is missing " << buffer.ToString();
       if (!IsFinished(user)) {
@@ -836,10 +831,10 @@ int64 RematerializationCost(const HloInstruction* instruction,
   // If none of the users of 'instruction' have been placed in the sequence (as
   // tracked by memory_tracker), then rematerialization of 'instruction' is a
   // zero-cost move of 'instruction' in the sequence.
-  if (!std::any_of(instruction->users().begin(), instruction->users().end(),
-                   [&memory_tracker](const HloInstruction* inst) {
-                     return memory_tracker.IsPlaced(inst);
-                   })) {
+  if (!absl::c_any_of(instruction->users(),
+                      [&memory_tracker](const HloInstruction* inst) {
+                        return memory_tracker.IsPlaced(inst);
+                      })) {
     return 0;
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 5a9b820a9d7f58695383b21c9e2126cf98970c83..d7d66ae1c4592723ca991d5ee971fa72cc1af90a 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -383,9 +383,7 @@ ServiceExecutableRunOptions HloRunner::GetServiceRunOptionsForDevice(
   if (device_assignment != nullptr) {
     run_options.set_device_assignment(device_assignment);
   }
-  return ServiceExecutableRunOptions(
-      run_options, backend().StreamBorrower(),
-      /*xla_intra_op_thread_pool=*/backend().eigen_intra_op_thread_pool());
+  return ServiceExecutableRunOptions(run_options, backend().StreamBorrower());
 }
 
 Backend& HloRunner::backend() {
diff --git a/tensorflow/compiler/xla/service/hlo_schedule.cc b/tensorflow/compiler/xla/service/hlo_schedule.cc
index 8f6eb974c5179b420c8f961393ca923e0a3b3530..e75373501cffac6a736be89e9f6139b6ff2cdbc1 100644
--- a/tensorflow/compiler/xla/service/hlo_schedule.cc
+++ b/tensorflow/compiler/xla/service/hlo_schedule.cc
@@ -140,7 +140,7 @@ Status HloSchedule::UpdateComputationSchedule(
   std::queue<HloInstruction*> worklist;
 
   for (HloInstruction* instruction : computation->instructions()) {
-    if (ids_in_schedule.count(instruction->unique_id()) == 0) {
+    if (!ids_in_schedule.contains(instruction->unique_id())) {
       // This is a newly added instruction which is not in the schedule.
       if (instruction->operands().empty()) {
         worklist.push(instruction);
@@ -204,7 +204,7 @@ Status HloSchedule::Update() {
   std::vector<HloComputation*> nonfusion_computations =
       module_->MakeNonfusionComputations();
   for (const HloComputation* computation : nonfusion_computations) {
-    TF_RET_CHECK(sequences_.count(computation->unique_id()) == 1)
+    TF_RET_CHECK(sequences_.contains(computation->unique_id()))
         << "Computation " << computation->name() << " not in HloSchedule.";
   }
   if (sequences_.size() > nonfusion_computations.size()) {
@@ -215,7 +215,7 @@ Status HloSchedule::Update() {
       nonfusion_computations_ids.insert(computation->unique_id());
     }
     for (auto it = sequences_.begin(); it != sequences_.end();) {
-      if (nonfusion_computations_ids.count(it->first) == 0) {
+      if (!nonfusion_computations_ids.contains(it->first)) {
         sequences_.erase(it++);
       } else {
         ++it;
@@ -244,7 +244,7 @@ Status HloSchedule::Verify() const {
       << "Schedule has " << sequences_.size() << " sequences, but module has "
       << nonfusion_computations.size() << " non-fusion computations";
   for (const HloComputation* computation : nonfusion_computations) {
-    TF_RET_CHECK(sequences_.count(computation->unique_id()) == 1)
+    TF_RET_CHECK(sequences_.contains(computation->unique_id()))
         << "Computation " << computation->name()
         << " missing from HLO schedule.";
   }
@@ -268,7 +268,7 @@ Status HloSchedule::Verify() const {
         << instruction_position.size() << " instructions, expected "
         << computation->instruction_count();
     for (const HloInstruction* instruction : computation->instructions()) {
-      TF_RET_CHECK(instruction_position.count(instruction) == 1)
+      TF_RET_CHECK(instruction_position.contains(instruction))
           << "Instruction " << instruction->name() << " is not in schedule";
     }
 
diff --git a/tensorflow/compiler/xla/service/hlo_schedule.h b/tensorflow/compiler/xla/service/hlo_schedule.h
index 486ddbf499de80c634bc497158cd79ca066cc866..a5f54ae2c33259d080631061dff9ae40b41495dc 100644
--- a/tensorflow/compiler/xla/service/hlo_schedule.h
+++ b/tensorflow/compiler/xla/service/hlo_schedule.h
@@ -110,7 +110,7 @@ class HloSchedule {
 
   // Returns true if the schedule has a sequence for the given computation.
   bool is_computation_scheduled(const HloComputation* computation) const {
-    return sequences_.count(computation->unique_id()) == 1;
+    return sequences_.contains(computation->unique_id());
   }
 
   // Updates the schedule such that it is (again) a valid schedule for the
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index bdd6ec1169a7d89cb4978a624d5606a5fb89df0f..37cc146bd7a6f2aef9373bd4afd8572ffac6473c 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_sharding.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/overflow_util.h"
@@ -106,13 +107,12 @@ string HloSharding::ToString() const {
 
 bool HloSharding::UsesDevice(int64 device) const {
   if (IsTuple()) {
-    return std::any_of(
-        tuple_elements_.begin(), tuple_elements_.end(),
-        [&](const HloSharding& s) { return s.UsesDevice(device); });
+    return absl::c_any_of(tuple_elements_, [&](const HloSharding& s) {
+      return s.UsesDevice(device);
+    });
   }
   const auto& devices = tile_assignment_;
-  return replicated_ ||
-         std::find(devices.begin(), devices.end(), device) != devices.end();
+  return replicated_ || absl::c_linear_search(devices, device);
 }
 
 std::map<int64, int64> HloSharding::UsedDevices(int64* count) const {
@@ -316,7 +316,7 @@ Status HloSharding::ValidateNonTuple(const Shape& shape,
   // All tile assignments must be less than the number of available cores and
   // unique.
   Status status = Status::OK();
-  std::set<int64> seen_cores;
+  absl::flat_hash_set<int64> seen_cores;
   tile_assignment_.Each(
       [&](absl::Span<const int64> indices, int32 core) {
         // Don't overwrite a bad status, so we report the first error.
@@ -324,7 +324,7 @@ Status HloSharding::ValidateNonTuple(const Shape& shape,
           if (core >= num_devices) {
             status = tensorflow::errors::InvalidArgument(StrCat(
                 "core ", core, " > ", num_devices, " in tile assignment"));
-          } else if (seen_cores.count(core) != 0) {
+          } else if (seen_cores.contains(core)) {
             status = tensorflow::errors::InvalidArgument(
                 StrCat("core ", core, " is not unique in tile assignment"));
           }
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h
index 9775505f8608ced3e33abe376f4922cc6a972726..5789ae09988d2a85247c5b8c037a172b3699f3b7 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding.h
@@ -101,8 +101,8 @@ class HloSharding {
     if (!IsTuple()) {
       return replicated_;
     }
-    return std::all_of(tuple_elements_.begin(), tuple_elements_.end(),
-                       [](const HloSharding& s) { return s.IsReplicated(); });
+    return absl::c_all_of(
+        tuple_elements_, [](const HloSharding& s) { return s.IsReplicated(); });
   }
 
   // Returns true if the tile size is the same as the input size.
@@ -110,8 +110,9 @@ class HloSharding {
     if (!IsTuple()) {
       return maximal_;
     }
-    return std::all_of(tuple_elements_.begin(), tuple_elements_.end(),
-                       [](const HloSharding& s) { return s.IsTileMaximal(); });
+    return absl::c_all_of(tuple_elements_, [](const HloSharding& s) {
+      return s.IsTileMaximal();
+    });
   }
 
   // Returns true if the sharding defines an operation on the given device.
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
index b414d2a66328284a4d8be8d35206bb837a2b3a58..094d98bc6e54028557f6d38cd165bf34e1fb8c46 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
@@ -99,7 +99,7 @@ std::vector<PassThrough> LocatePassThroughDomainLinks(
         << "Instruction is not a kDomain: " << instruction->ToString();
     for (HloInstruction* user : instruction->users()) {
       if (user->opcode() == HloOpcode::kDomain &&
-          domain.exit_domains.count(user) != 0) {
+          domain.exit_domains.contains(user)) {
         pass_through.emplace_back(user, instruction);
         VLOG(2) << "Found passthrough domain link:";
         VLOG(2) << "  " << user->ToString();
@@ -253,7 +253,7 @@ StatusOr<bool> ApplyShardingFromUsers(HloInstruction* instruction,
       instruction->shape(), HloSharding::AssignDevice(kUnassignedDevice));
   for (HloInstruction* user : instruction->users()) {
     if (user->opcode() == HloOpcode::kDomain &&
-        domain.exit_domains.count(user) > 0) {
+        domain.exit_domains.contains(user)) {
       // If a user is a domain and it is registered in the domain exits, then
       // the instruction sharding is taken directly from the domain, and no
       // further users need to be visited.
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
index 4ea39a7628317867eb054bb14de48f721354cd9d..c1f69db74eafb7743e85f499f2f4828ed0375501 100644
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
@@ -61,8 +61,7 @@ void CleanNodeName(string* name) {
   name->erase(std::remove(name->begin(), name->end(), '%'), name->end());
   const string chars_to_replace = "<>[]";
   auto pred = [&](char c) {
-    return std::find(chars_to_replace.begin(), chars_to_replace.end(), c) !=
-           chars_to_replace.end();
+    return absl::c_linear_search(chars_to_replace, c);
   };
   std::replace_if(name->begin(), name->end(), pred, '_');
 }
diff --git a/tensorflow/compiler/xla/service/hlo_value.cc b/tensorflow/compiler/xla/service/hlo_value.cc
index d409df06be44a79d1bf7c90ce3ff34fca975fda4..218b33b2ac2b86edc30b2f014ba206c71da37682 100644
--- a/tensorflow/compiler/xla/service/hlo_value.cc
+++ b/tensorflow/compiler/xla/service/hlo_value.cc
@@ -209,7 +209,7 @@ std::ostream& operator<<(std::ostream& out, const HloValue& value) {
 }
 
 void HloValueSet::SortAndUniquifyValues() {
-  std::sort(values_.begin(), values_.end(), HloValue::IdLessThan);
+  absl::c_sort(values_, HloValue::IdLessThan);
   values_.erase(std::unique(values_.begin(), values_.end(), HloValue::IdEqual),
                 values_.end());
 }
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 8d8720d7be58b7f02dbeba7b2394956b6e93a43f..2c69c27ce55e0f9f6d185f807e6d43d6f2cfe8d4 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -387,6 +387,14 @@ Status ShapeVerifier::HandleReduce(HloInstruction* reduce) {
 
 Status ShapeVerifier::HandleBitcast(HloInstruction* bitcast) {
   TF_RETURN_IF_ERROR(CheckOperandCount(bitcast, 1));
+  // Bitcasts are not allowed to change the element type.
+  if (bitcast->operand(0)->shape().element_type() !=
+      bitcast->shape().element_type()) {
+    return InternalError(
+        "Bitcast can not change the element type from %s to %s",
+        PrimitiveType_Name(bitcast->operand(0)->shape().element_type()),
+        PrimitiveType_Name(bitcast->shape().element_type()));
+  }
   return Status::OK();
 }
 
@@ -496,21 +504,23 @@ Status ShapeVerifier::HandleSlice(HloInstruction* slice) {
 }
 
 Status ShapeVerifier::HandleDynamicSlice(HloInstruction* dynamic_slice) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(dynamic_slice, 2));
-  return CheckShape(dynamic_slice, ShapeInference::InferDynamicSliceShape(
-                                       dynamic_slice->operand(0)->shape(),
-                                       dynamic_slice->operand(1)->shape(),
-                                       dynamic_slice->dynamic_slice_sizes()));
+  return CheckShape(
+      dynamic_slice,
+      ShapeInference::InferDynamicSliceShape(
+          dynamic_slice->operand(0)->shape(),
+          Cast<HloDynamicSliceInstruction>(dynamic_slice)->index_shapes(),
+          dynamic_slice->dynamic_slice_sizes()));
 }
 
 Status ShapeVerifier::HandleDynamicUpdateSlice(
     HloInstruction* dynamic_update_slice) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(dynamic_update_slice, 3));
-  return CheckShape(dynamic_update_slice,
-                    ShapeInference::InferDynamicUpdateSliceShape(
-                        dynamic_update_slice->operand(0)->shape(),
-                        dynamic_update_slice->operand(1)->shape(),
-                        dynamic_update_slice->operand(2)->shape()));
+  return CheckShape(
+      dynamic_update_slice,
+      ShapeInference::InferDynamicUpdateSliceShape(
+          dynamic_update_slice->operand(0)->shape(),
+          dynamic_update_slice->operand(1)->shape(),
+          Cast<HloDynamicUpdateSliceInstruction>(dynamic_update_slice)
+              ->index_shapes()));
 }
 
 Status ShapeVerifier::HandleTuple(HloInstruction* tuple) {
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index a1a6aba9728c137d17487b5914f67cb3966fc12b..479905b317d5639ff2cebc4d1044e21b527693f6 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -168,8 +168,13 @@ class ShapeVerifier : public DfsHloVisitor {
 // An interface used to encapsulate target-specific verification quirks.
 class TargetVerifierMetadata {
  public:
+  TargetVerifierMetadata(std::function<int64(const Shape&)> shape_size_function)
+      : shape_size_function_(shape_size_function) {}
+
   // Returns a target-specific shape size.
-  virtual int64 ShapeSize(const Shape& shape) const = 0;
+  int64 ShapeSize(const Shape& shape) const {
+    return shape_size_function_(shape);
+  }
 
   virtual std::unique_ptr<ShapeVerifier> GetVerifier() const = 0;
 
@@ -178,20 +183,23 @@ class TargetVerifierMetadata {
 
   TargetVerifierMetadata(const TargetVerifierMetadata&) = delete;
   TargetVerifierMetadata& operator=(const TargetVerifierMetadata&) = delete;
+
+ private:
+  // Returns a target-specific shape size.
+  std::function<int64(const Shape&)> shape_size_function_;
 };
 
 // The default implementation of TargetVerifierMetadata, used unless the target
 // needs to override it.
 class DefaultVerifierMetadata : public TargetVerifierMetadata {
  public:
-  DefaultVerifierMetadata(bool layout_sensitive, bool allow_mixed_precision)
-      : layout_sensitive_(layout_sensitive),
+  DefaultVerifierMetadata(
+      bool layout_sensitive, bool allow_mixed_precision,
+      std::function<int64(const Shape&)> shape_size_function)
+      : TargetVerifierMetadata(shape_size_function),
+        layout_sensitive_(layout_sensitive),
         allow_mixed_precision_(allow_mixed_precision) {}
 
-  int64 ShapeSize(const Shape& shape) const override {
-    return ShapeUtil::ByteSizeOf(shape);
-  }
-
   // Creates a ShapeVerifier that checks that shapes match inferred
   // expectations. This creates a new verifier every time because ShapeVerifier,
   // being a DfsHloVisitor, is stateful. We want a clean object for each run of
@@ -210,11 +218,14 @@ class DefaultVerifierMetadata : public TargetVerifierMetadata {
 // the module.
 class HloVerifier : public HloModulePass {
  public:
-  explicit HloVerifier(bool layout_sensitive, bool allow_mixed_precision,
-                       std::function<bool(const HloInstruction*)>
-                           instruction_can_change_layout_func = {})
+  explicit HloVerifier(
+      bool layout_sensitive, bool allow_mixed_precision,
+      std::function<bool(const HloInstruction*)>
+          instruction_can_change_layout_func = {},
+      std::function<int64(const Shape&)> shape_size_func =
+          [](const Shape& shape) { return ShapeUtil::ByteSizeOf(shape); })
       : target_metadata_(absl::make_unique<DefaultVerifierMetadata>(
-            layout_sensitive, allow_mixed_precision)),
+            layout_sensitive, allow_mixed_precision, shape_size_func)),
         instruction_can_change_layout_func_(
             std::move(instruction_can_change_layout_func)) {
     CHECK(instruction_can_change_layout_func_ == nullptr || layout_sensitive);
diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
index 4bc557e4e62e7df4e25fda86fe417e84129b464c..d27c62a879bedf316508b6ff95ab2536106b40d0 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/layout_assignment.h"
@@ -27,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla.pb.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
@@ -386,6 +388,55 @@ TEST_F(HloVerifierTest, AddWithLayoutChange) {
   ASSERT_TRUE(status.ok());
 }
 
+TEST_F(HloVerifierTest, ScalarIndexDynamicSlice) {
+  const char* const kScalarIndexDynamicSlice = R"(
+    HloModule DynamicSlice_module
+
+    ENTRY %DynamicSlice.v5 (original_parameter: s32[2,2,258], start_index: s32[]) -> s32[2,2,258] {
+      %original_parameter = s32[2,2,258] parameter(0)
+      %constant = s32[] constant(0)
+      %start_index = s32[] parameter(1)
+      ROOT %dynamic-slice = s32[2,2,258] dynamic-slice(s32[2,2,258] %original_parameter, s32[] %constant, s32[] %constant, s32[] %start_index), dynamic_slice_sizes={2,2,258}
+    }
+  )";
+
+  HloModuleConfig config;
+  DebugOptions debug_options = config.debug_options();
+  debug_options.set_xla_allow_scalar_index_dynamic_ops(true);
+  config.set_debug_options(debug_options);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kScalarIndexDynamicSlice, config));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_TRUE(status.ok());
+}
+
+TEST_F(HloVerifierTest, ScalarIndexDynamicUpdateSlice) {
+  const char* const kScalarIndexDynamicSlice = R"(
+    HloModule DynamicUpdateSlice_module
+
+    ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_index.0: s32[], start_index.1: s32[], start_index.2: s32[], start_index.3: s32[]) -> s32[1,1,25,1] {
+      %input = s32[1,1,25,1]{3,2,1,0} parameter(0)
+      %update = s32[1,1,2,1]{3,2,1,0} parameter(1)
+      %start_index.0 = s32[] parameter(2)
+      %start_index.1 = s32[] parameter(3)
+      %start_index.2 = s32[] parameter(4)
+      %start_index.3 = s32[] parameter(5)
+      ROOT %dynamic-update-slice = s32[1,1,25,1]{3,2,1,0} dynamic-update-slice(s32[1,1,25,1]{3,2,1,0} %input, s32[1,1,2,1]{3,2,1,0} %update, s32[] %start_index.0, s32[] %start_index.1, s32[] %start_index.2, s32[] %start_index.3)
+    }
+  )";
+
+  HloModuleConfig config;
+  DebugOptions debug_options = config.debug_options();
+  debug_options.set_xla_allow_scalar_index_dynamic_ops(true);
+  config.set_debug_options(debug_options);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kScalarIndexDynamicSlice, config));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_TRUE(status.ok());
+}
+
 TEST_F(HloVerifierTestLayoutSensitive, AddWithLayoutChangeNotAllowed) {
   TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(kAddWithLayoutChangeHlo));
   auto status = verifier().Run(module.get()).status();
@@ -399,8 +450,9 @@ TEST_F(HloVerifierTestLayoutSensitive, SliceWithLayoutChangeNotAllowed) {
    HloModule SliceWithLayoutChange
     ENTRY SliceWithLayoutChange {
       par0 = f32[4,5]{0,1} parameter(0)
-      par1 = s32[2] parameter(1)
-      ROOT dslice0 = f32[3,4]{1,0} dynamic-slice(par0, par1),
+      par1 = s32[] parameter(1)
+      par2 = s32[] parameter(2)
+      ROOT dslice0 = f32[3,4]{1,0} dynamic-slice(par0, par1, par2),
         dynamic_slice_sizes={3,4}
     }
   )";
@@ -429,5 +481,23 @@ TEST_F(HloVerifierTestLayoutSensitive, ConcatWithLayoutChangeNotAllowed) {
   EXPECT_THAT(status.error_message(),
               HasSubstr("Instruction shouldn't change layouts"));
 }
+
+TEST_F(HloVerifierTest, BitcastCanNotChangeElementType) {
+  const char* const hlo_string = R"(
+  HloModule Module
+
+  ENTRY BitcastCanNotChangeElementType {
+   constant.0 = f32[2] constant({0.0, 0.0})
+   ROOT bitcast = s32[2] bitcast(constant.0)
+  }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("Bitcast can not change the element type"));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
index 90904ac00110457bcc3b8974816a7080c4ab89fc..88fc62bd1e2a7830b3f61738a8642308ef4225a7 100644
--- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
+++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
@@ -128,9 +128,9 @@ string HumanReadableProfileBuilder::ToString() const {
 
   // Sort ops in decreasing order of cycles, and print them.
   std::vector<OpInfo> sorted_ops(op_infos_);
-  std::sort(
-      sorted_ops.begin(), sorted_ops.end(),
-      [](const OpInfo& a, const OpInfo& b) { return a.cycles > b.cycles; });
+  absl::c_sort(sorted_ops, [](const OpInfo& a, const OpInfo& b) {
+    return a.cycles > b.cycles;
+  });
   for (const auto& op : sorted_ops) {
     print_op(op);
   }
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis.cc b/tensorflow/compiler/xla/service/indexed_array_analysis.cc
index a41cf714c5ed6adcf7aa1f0e54cf052594834b5d..76bf48870d55e82497ba5f63e9e2e2a322cb330e 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis.cc
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis.cc
@@ -103,7 +103,7 @@ Status IndexedArrayAnalysis::TraverseAndPopulateCache(
 
   do {
     const HloInstruction* instr = stack.back();
-    if (cache_.count(instr)) {
+    if (cache_.contains(instr)) {
       stack.pop_back();
       continue;
     }
@@ -111,9 +111,9 @@ Status IndexedArrayAnalysis::TraverseAndPopulateCache(
     switch (FindOrDie(dfs_state_map, instr)) {
       case kDiscovered: {
         for (const HloInstruction* operand : instr->operands()) {
-          if (!cache_.count(operand)) {
+          if (!cache_.contains(operand)) {
             stack.push_back(operand);
-            CHECK(!dfs_state_map.count(operand) ||
+            CHECK(!dfs_state_map.contains(operand) ||
                   dfs_state_map[operand] == kDiscovered);
             dfs_state_map[operand] = kDiscovered;
           }
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index f6f25c44131226d35f8e927d62defee291bf46dd..b97060535d998e174639dceca5cde517cef01e30 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -178,19 +178,18 @@ bool InstructionFusion::EffectivelyAtMostUnary(HloInstruction* hlo) {
           output_rank = std::max(output_rank, ShapeUtil::TrueRank(subshape));
         }
       });
-  return std::count_if(hlo->operands().begin(), hlo->operands().end(),
-                       [output_rank](HloInstruction* operand) {
-                         if (operand->opcode() == HloOpcode::kBroadcast ||
-                             operand->opcode() == HloOpcode::kIota) {
-                           return false;
-                         }
-                         if (operand->opcode() == HloOpcode::kConstant &&
-                             ShapeUtil::IsEffectiveScalar(operand->shape())) {
-                           return false;
-                         }
-                         return ShapeUtil::TrueRank(operand->shape()) >=
-                                output_rank;
-                       }) <= 1;
+  return absl::c_count_if(
+             hlo->operands(), [output_rank](HloInstruction* operand) {
+               if (operand->opcode() == HloOpcode::kBroadcast ||
+                   operand->opcode() == HloOpcode::kIota) {
+                 return false;
+               }
+               if (operand->opcode() == HloOpcode::kConstant &&
+                   ShapeUtil::IsEffectiveScalar(operand->shape())) {
+                 return false;
+               }
+               return ShapeUtil::TrueRank(operand->shape()) >= output_rank;
+             }) <= 1;
 }
 
 bool InstructionFusion::CanFuseOnAllPaths(
@@ -409,9 +408,8 @@ class ReversePostOrderFusionQueue : public FusionQueue {
       }
       sorted_operand_numbers.push_back(i);
     }
-    std::sort(
-        sorted_operand_numbers.begin(), sorted_operand_numbers.end(),
-        [&](int64 i, int64 j) {
+    absl::c_sort(
+        sorted_operand_numbers, [&](int64 i, int64 j) {
           // Instructions with higher priority in the queue come first.
           return (
               FindOrDie(post_order_index_, instruction->mutable_operand(i)) >
diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD
index a981d94a999e3d322986bc2bfd56a5b0b5d175fc..9b4cafa4b42213d589ff25ce73d0fdfecc678dc2 100644
--- a/tensorflow/compiler/xla/service/interpreter/BUILD
+++ b/tensorflow/compiler/xla/service/interpreter/BUILD
@@ -1,12 +1,12 @@
-licenses(["notice"])  # Apache 2.0
-
-package(default_visibility = ["//visibility:public"])
-
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
     "if_static",
 )
 
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//visibility:public"])
+
 cc_library(
     name = "interpreter_transfer_manager",
     srcs = ["interpreter_transfer_manager.cc"],
@@ -32,8 +32,10 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/service:algebraic_simplifier",
+        "//tensorflow/compiler/xla/service:batchnorm_expander",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:computation_placer",
+        "//tensorflow/compiler/xla/service:dynamic_index_splitter",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
         "//tensorflow/compiler/xla/service:hlo",
@@ -41,12 +43,14 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_cost_analysis",
         "//tensorflow/compiler/xla/service:hlo_cse",
         "//tensorflow/compiler/xla/service:hlo_dce",
+        "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter",
         "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/compiler/xla/service:hlo_pass_pipeline",
         "//tensorflow/compiler/xla/service:hlo_subcomputation_unification",
         "//tensorflow/compiler/xla/service:layout_assignment",
         "//tensorflow/compiler/xla/service:map_inliner",
+        "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/service:reshape_mover",
         "//tensorflow/compiler/xla/service:while_loop_simplifier",
         "//tensorflow/core:lib",
@@ -115,6 +119,8 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_headers_lib",
+        "//tensorflow/stream_executor/host:host_stream",
+        "//tensorflow/stream_executor/host:host_timer",
         "@com_google_absl//absl/types:span",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index d37ae94bf6c4c697bbf30390c02a5099271e00a4..4818b2dae0a9951346600a9b2906488c3ef7e06e 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
 #include "tensorflow/compiler/xla/service/computation_placer.h"
+#include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
 #include "tensorflow/compiler/xla/service/hlo_cse.h"
@@ -31,6 +32,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/interpreter/executable.h"
 #include "tensorflow/compiler/xla/service/layout_assignment.h"
 #include "tensorflow/compiler/xla/service/map_inliner.h"
+#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
 #include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -43,9 +45,15 @@ namespace interpreter {
 Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
   HloPassPipeline pipeline("Interpreter");
 
+  pipeline.AddPass<DynamicIndexSplitter>();
   pipeline.AddPass<LayoutAssignment>(
       hlo_module->mutable_entry_computation_layout(),
       LayoutAssignment::InstructionCanChangeLayout);
+
+  ReducePrecisionInsertion::AddPasses(
+      &pipeline, hlo_module->config().debug_options(),
+      ReducePrecisionInsertion::PassTiming::BEFORE_OPTIMIZATION);
+
   return pipeline.Run(hlo_module).status();
 }
 
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index 88e33464a27137fd56021e232cc93a832d1a5b3f..7a6ebdef708bcc3a92fbd8618db0c42c35e6ce8b 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -68,6 +68,18 @@ StatusOr<ScopedShapedBuffer> InterpreterExecutable::ExecuteOnStream(
         "Mismatch between argument count and graph parameter count.");
   }
 
+  // Check that the args have the right shape.
+  for (int64 i = 0; i < computation->num_parameters(); ++i) {
+    const auto& expected_shape = computation->parameter_instruction(i)->shape();
+    const auto& actual_shape = arguments[i]->on_device_shape();
+    if (!ShapeUtil::Equal(expected_shape, actual_shape)) {
+      return InvalidArgument(
+          "Shape mismatch on parameter %d.  Expected %s, but was %s.", i,
+          ShapeUtil::HumanString(expected_shape),
+          ShapeUtil::HumanString(actual_shape));
+    }
+  }
+
   TF_ASSIGN_OR_RETURN(TransferManager * transfer_manager,
                       TransferManager::GetForPlatform(platform));
 
@@ -86,8 +98,8 @@ StatusOr<ScopedShapedBuffer> InterpreterExecutable::ExecuteOnStream(
   {
     tensorflow::mutex_lock lock(evaluator_lock_);
     evaluator_->ResetVisitStates();
-    TF_ASSIGN_OR_RETURN(result_literal, evaluator_->Evaluate<Literal>(
-                                            *computation, arg_literals));
+    TF_ASSIGN_OR_RETURN(result_literal,
+                        evaluator_->Evaluate(*computation, arg_literals));
   }
 
   // Transform the result literal back into a ShapedBuffer.
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index 1c4ac178f0b9a2789d4e6d4554e3a56e0e7cf77f..10ff7bb6d46ee3b2cd1228b4b7a49269be8c65d3 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -147,12 +147,9 @@ bool LayoutConstraints::OperandBufferForwarded(
   PointsToSet::BufferSet* output_buffers = GetBufferSet(instruction);
   PointsToSet::BufferSet* operand_buffers =
       GetBufferSet(instruction->operand(operand_no));
-  for (const LogicalBuffer* output_buffer : *output_buffers) {
-    if (operand_buffers->count(output_buffer) > 0) {
-      return true;
-    }
-  }
-  return false;
+  return absl::c_any_of(*output_buffers, [&](const LogicalBuffer* b) {
+    return operand_buffers->count(b) > 0;
+  });
 }
 
 Status LayoutConstraints::SetBufferLayout(const Layout& layout,
@@ -1236,6 +1233,23 @@ Status LayoutAssignment::PropagateUseConstraintToDefs(
       });
 }
 
+namespace {
+// A transpose or a reshape that only changes trivial dimensions have meaningful
+// layouts that are valuable to propagate in a depthfirst manner to avoid
+// unassigned layouts in the graph.
+bool InstructionShouldPropagateDepthFirst(const HloInstruction& hlo) {
+  switch (hlo.opcode()) {
+    case HloOpcode::kReshape:
+      return std::get<0>(hlo.ReshapeMerelyInsertsOrDeletes1SizedDimensions());
+    case HloOpcode::kTranspose:
+      return true;
+    default:
+      return false;
+  }
+}
+
+}  // namespace
+
 Status LayoutAssignment::PropagateOperandConstraint(
     const OperandLayoutConstraint& operand_constraint,
     LayoutConstraints* constraints) {
@@ -1370,7 +1384,7 @@ Status LayoutAssignment::PropagateOperandConstraint(
             TF_RETURN_IF_ERROR(constraints->SetBufferLayout(
                 *layout, *buffer,
                 /*mandatory=*/user->opcode() == HloOpcode::kReduce,
-                /*dfs=*/false));
+                /*dfs=*/InstructionShouldPropagateDepthFirst(*user)));
           }
         }
         return Status::OK();
@@ -1420,11 +1434,9 @@ Status LayoutAssignment::PropagateBufferConstraintToOperands(
             ChooseOperandLayoutFromOutputLayout(buffer_constraint.layout(),
                                                 instruction, operand_no);
         if (operand_layout != nullptr) {
-          // Do not propagate operand constraints of transposes and reshapes, it
-          // tends to create really bad layouts.
           TF_RETURN_IF_ERROR(constraints->SetArrayOperandLayout(
               *operand_layout, instruction, operand_no, /*mandatory=*/false,
-              /*dfs=*/false));
+              /*dfs=*/InstructionShouldPropagateDepthFirst(*instruction)));
         }
       } else {
         VLOG(6) << "Operand already has a constraint "
@@ -2120,7 +2132,7 @@ Status LayoutAssignment::ClearPreviousPassSideEffects(HloModule* module) {
     for (HloInstruction* instruction :
          computation->MakeInstructionPostOrder()) {
       if (instruction->opcode() == HloOpcode::kCopy &&
-          added_copies_.count(instruction) > 0) {
+          added_copies_.contains(instruction)) {
         VLOG(5) << "Removing added copy: " << instruction->ToString();
         TF_RETURN_IF_ERROR(
             instruction->ReplaceAllUsesWith(instruction->mutable_operand(0)));
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index 3b081de3c7826c3c11a7d87d542835d0ecce1b7e..5701cb5b025e563247d46d0d24f81a5f886fc23b 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -243,7 +243,7 @@ class ChannelLayoutConstraints {
 
   // Returns true if channel_id has a layout constraint.
   bool IsChannelConstrained(int64 channel_id) const {
-    return constraints_.count(channel_id) > 0;
+    return constraints_.contains(channel_id);
   }
 
   // Given `shape`, apply the layout for `channel_id`. `channel_id` must already
@@ -276,7 +276,7 @@ class ChannelLayoutConstraints {
   }
 
  private:
-  std::unordered_map<int64, Layout> constraints_;
+  absl::flat_hash_map<int64, Layout> constraints_;
 };
 
 // HLO pass which assigns layouts to all instructions in the HLO module while
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index 387b385157ab6ece65c692de65b4da33038f1f30..c8cf3c47d380012fdb0206c0d20d67e6a13017ae 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -960,8 +960,9 @@ TEST_F(LayoutAssignmentTest, CopyDSliceOperandToAvoidImplicitLayoutChange) {
     ENTRY CopyDSliceOperandToAvoidImplicitLayoutChange {
       par0 = f32[3,4]{1,0} parameter(0)
       par1 = f32[4,5]{0,1} parameter(1)
-      par2 = s32[2] parameter(2)
-      dslice0 = f32[3,4] dynamic-slice(par1, par2), dynamic_slice_sizes={3,4}
+      par2 = s32[] parameter(2)
+      par3 = s32[] parameter(3)
+      dslice0 = f32[3,4] dynamic-slice(par1, par2, par3), dynamic_slice_sizes={3,4}
       ROOT add0 = f32[3,4]{1,0} add(par0,dslice0)
     }
   )";
@@ -982,7 +983,7 @@ TEST_F(LayoutAssignmentTest, CopyDSliceOperandToAvoidImplicitLayoutChange) {
                   m::Parameter(),
                   m::DynamicSlice(
                       m::Copy(m::Parameter(1)).WithShapeEqualTo(&shape_copy),
-                      m::Parameter(2)))));
+                      m::Parameter(2), m::Parameter(3)))));
 }
 
 TEST_F(LayoutAssignmentTest, CopyConcatOperandToAvoidImplicitLayoutChange) {
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index 728a66b388f0f9af480ff88b5e96990a26e36af5..54e8fe1947eea43b89b6fae70655a4f52ea8e69d 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -169,6 +169,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:elemental_ir_emitter",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
         "@llvm//:core",
diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
index 643ecd0fbaa546c551097b29e74ccd49418e1466..ce3d922ca7a9bdea3a520959a8b8d284bc3e0d64 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
@@ -81,9 +81,7 @@ void AliasAnalysis::AddAliasingInformationToIrArray(const HloInstruction& hlo,
     if (hlo.opcode() == HloOpcode::kParameter) {
       const std::vector<HloInstruction*>& parameter_instructions =
           module_.entry_computation()->parameter_instructions();
-      if (std::find(parameter_instructions.begin(),
-                    parameter_instructions.end(),
-                    &hlo) != parameter_instructions.end()) {
+      if (absl::c_linear_search(parameter_instructions, &hlo)) {
         array->MarkInvariantOverWholeProgram(context_);
       }
     }
diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h
index 2b46b3c3964b15548dbacc8b0ada0047a0fa85b6..12e2f449e23ac2511aac576fed893f5a9ef510c0 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h
@@ -76,15 +76,12 @@ class AliasAnalysis {
   // A map from a buffer slice to metadata corresponding to its alias.scope
   // metadata.  The index kParameterAliasSet is used to hold aliasing
   // information for parameters.
-  absl::flat_hash_map<BufferAllocation::Slice, llvm::MDNode*,
-                      BufferAllocation::Slice::Hasher>
+  absl::flat_hash_map<BufferAllocation::Slice, llvm::MDNode*>
       alias_scope_metadata_;
 
   // A map from a buffer slice to metadata corresponding to its noalias
   // metadata.
-  absl::flat_hash_map<BufferAllocation::Slice, llvm::MDNode*,
-                      BufferAllocation::Slice::Hasher>
-      noalias_metadata_;
+  absl::flat_hash_map<BufferAllocation::Slice, llvm::MDNode*> noalias_metadata_;
 };
 
 }  // namespace llvm_ir
diff --git a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
index 1da77945328c42826c305b778b4964f53dece90d..c66eaec8fb0e4c03f6967fec0cf0ae9661cdf470 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
@@ -36,8 +36,10 @@ bool CanUpdateDynamicSliceInPlace(HloInstruction* dynamic_update_slice,
 // EmitFusedDynamicUpdateSliceInPlace.
 //
 // Emits a sequential loop if launch_dimensions is null.
+using IndexGenerator = std::function<StatusOr<llvm::Value*>(int64)>;
+
 static Status EmitDynamicUpdateSliceInPlaceImpl(
-    const Shape& update_shape, const ElementGenerator& start_indices_generator,
+    const Shape& update_shape, const IndexGenerator& start_indices_generator,
     bool is_signed, ElementGenerator update_array_generator,
     const IrArray& output_array, const gpu::LaunchDimensions* launch_dimensions,
     absl::string_view name, llvm::IRBuilder<>* b) {
@@ -47,8 +49,7 @@ static Status EmitDynamicUpdateSliceInPlaceImpl(
   const int64 rank = output_shape.rank();
   IrArray::Index start_index(b->getInt64Ty(), rank);
   for (int64 i = 0; i < rank; ++i) {
-    IrArray::Index dim_index({b->getInt64(i)});
-    TF_ASSIGN_OR_RETURN(start_index[i], start_indices_generator(dim_index));
+    TF_ASSIGN_OR_RETURN(start_index[i], start_indices_generator(i));
     llvm::Value* output_dim_size = llvm::ConstantInt::get(
         start_index[i]->getType(), output_shape.dimensions(i));
     llvm::Value* update_dim_size = llvm::ConstantInt::get(
@@ -112,9 +113,20 @@ Status EmitDynamicUpdateSliceInPlace(absl::Span<const IrArray> operand_arrays,
   Shape output_shape = output_array.GetShape();
   Shape update_shape = update_array.GetShape();
 
-  ElementGenerator start_indices_generator = [&](const IrArray::Index& index) {
-    return start_indices_array.EmitReadArrayElement(index, b);
-  };
+  IndexGenerator start_indices_generator;
+  // TODO(b/118437727): Remove the R1 path, and rename the variables.
+  if (start_indices_array.GetShape().rank() == 1) {
+    start_indices_generator = [&](int64 index) {
+      return start_indices_array.EmitReadArrayElement(
+          IrArray::Index({b->getInt64(index)}), b);
+    };
+  } else {
+    start_indices_generator = [&](int64 index) {
+      return operand_arrays[2 + index].EmitReadArrayElement(
+          IrArray::Index(b->getInt64Ty()), b);
+    };
+  }
+
   ElementGenerator update_array_generator = [&](const IrArray::Index& index) {
     return update_array.EmitReadArrayElement(index, b);
   };
@@ -165,8 +177,21 @@ static Status EmitFusedDynamicUpdateSliceInPlaceImpl(
                                elemental_emitter);
   TF_RETURN_IF_ERROR(dynamic_update_slice->Accept(&fused_emitter));
   ElementGenerator update_array_generator = fused_emitter.GetGenerator(update);
-  ElementGenerator start_indices_generator =
-      fused_emitter.GetGenerator(start_indices);
+
+  // TODO(b/118437727): Remove the R1 path, and rename the variables.
+  IndexGenerator start_indices_generator;
+  if (start_indices->shape().rank() == 1) {
+    start_indices_generator = [&](int64 index) {
+      return fused_emitter.GetGenerator(start_indices)(
+          IrArray::Index({b->getInt64(index)}));
+    };
+  } else {
+    start_indices_generator = [&](int64 index) {
+      ElementGenerator element_generator =
+          fused_emitter.GetGenerator(dynamic_update_slice->operand(2 + index));
+      return element_generator(IrArray::Index(b->getInt64Ty()));
+    };
+  }
 
   bool is_signed = ShapeUtil::ElementIsSigned(start_indices->shape());
   return EmitDynamicUpdateSliceInPlaceImpl(
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
index 03a475b40b5a1f5100091fc11d744792b641cb04..e440f05e2b2f0d4a2a4c7b326b4881183de4d235 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
@@ -35,7 +35,7 @@ using llvm_ir::IrArray;
 Status FusedIrEmitter::DefaultAction(HloInstruction* hlo) {
   indexed_generators_[hlo] =
       [=](const IrArray::Index& index) -> StatusOr<llvm::Value*> {
-    if (generated_value_cache_[hlo].count(index.multidim()) > 0) {
+    if (generated_value_cache_[hlo].contains(index.multidim())) {
       llvm::Value* generated_value =
           generated_value_cache_[hlo][index.multidim()];
       llvm::BasicBlock* generated_value_bb = nullptr;
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
index 1b9c61f6700e2a1309b21e499f4a9e2439ed3702..e6d52a580c04a920d3f0e8ed6f39c1cae587cf1b 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <map>
 #include <unordered_map>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/types/optional.h"
 #include "absl/types/span.h"
 #include "llvm/IR/IRBuilder.h"
@@ -134,8 +135,9 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault {
 
   // Cache of generated values, lest we regenerate an element of a node with
   // multiple outgoing edges
-  std::unordered_map<const HloInstruction*,
-                     std::map<std::vector<llvm::Value*>, llvm::Value*>>
+  absl::flat_hash_map<
+      const HloInstruction*,
+      absl::flat_hash_map<std::vector<llvm::Value*>, llvm::Value*>>
       generated_value_cache_;
 };
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
index d6d84994ee147f4b8c1a333b0eaccdf6e0a2219b..a483f7051f268edf621a861ee09fb8c2376a8fc6 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
@@ -189,6 +189,8 @@ class IrArray {
       return llvm::ConstantInt::get(index_type_, c);
     }
 
+    void ClearLinearIndex() { linear_ = nullptr; }
+
    private:
     // Changing the multi-dimensional index invalidates the linear index.
     std::vector<llvm::Value*>& mutable_multidim() {
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
index cebbc4290163d4e98003cd7b5df6ec906509a446..cd8dd72cd775d5e0b52f96a2326367da0775e7eb 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
@@ -123,7 +123,8 @@ KernelMappingScheme::KernelMappingScheme(
       dims_in_elems_(dims_in_elems.begin(), dims_in_elems.end()),
       tile_sizes_{1, tile_size_y, tile_size_x},
       num_threads_x_(num_threads_x),
-      num_threads_y_(num_threads_y) {
+      num_threads_y_(num_threads_y),
+      dilated_x_(true) {
   DCHECK_EQ(dims_in_elems_.size(), 3);
   DCHECK_EQ(req_block_sizes.size(), 3);
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
index fb633b12e60d1a9f3103fb2919ad2c3f3f14de20..f802cc27d519e621262f328903697373aa8c284c 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
@@ -117,7 +117,10 @@ class KernelMappingScheme {
   int64 GetNumberOfTilesInOneBlock() const {
     return absl::c_accumulate(block_sizes_, 1, std::multiplies<int64>());
   }
-
+  int64 GetNumberOfTilesInOneBlockForDimension(int d) const {
+    DCHECK(d >= DimZ && d <= DimX);
+    return block_sizes_[d];
+  }
   int64 GetNumberOfBlocks() const {
     return absl::c_accumulate(dims_in_blocks_, 1, std::multiplies<int64>());
   }
@@ -147,6 +150,16 @@ class KernelMappingScheme {
            GetNumberOfThreadsForDimensionY();
   }
 
+  bool DilatedX() const { return dilated_x_; }
+  void SetDilatedX(bool v) {
+    dilated_x_ = v;
+    if (!dilated_x_) {
+      // dilated_x_=false is for the purpose of vectorization, which requires
+      // GetTileSizeForDimension(DimX) to be a multiplier of num_threads_x_.
+      CHECK_EQ(GetTileSizeForDimension(DimX) % num_threads_x_, 0);
+    }
+  }
+
   IrArray::Index EmitBlockIndex(llvm::Type* index_ty);
   // Returns the index for the first tile in the block with the given block
   // index.
@@ -186,6 +199,13 @@ class KernelMappingScheme {
   int64 num_threads_x_;
   // Number of threads used to process elements in the Y direction of a tile.
   int64 num_threads_y_;
+
+  // When num_threads_x threads process a total of tile_size_x elements in the
+  // X dimension of a tile, each threads process n=tile_size_x/num_threads_x
+  // elements. When dilated_x=false, the n elements processed by a thread are
+  // contiguous. On the other hand, when dilated_x=true the n elements are
+  // dilated by a factor of num_threads_x.
+  bool dilated_x_;
 };
 
 // A class to represent information for tiled parameters to support IR emission
diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.cc b/tensorflow/compiler/xla/service/multi_output_fusion.cc
index 9ccdd7d8d818b9fa3aa77cdd10d37ca18928b448..53d52d9a3d918fa6dee093668923fcfff963d084 100644
--- a/tensorflow/compiler/xla/service/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/multi_output_fusion.cc
@@ -198,7 +198,7 @@ void MultiOutputFusion::Update(HloInstruction* instr1, HloInstruction* instr2) {
     if (instr == fusion || is_fused(instr) || is_connected(fusion, instr)) {
       continue;
     }
-    if (in_list.count(instr) > 0) {
+    if (in_list.contains(instr)) {
       continue;
     }
     int64 profit = GetProfit(instr, fusion);
diff --git a/tensorflow/compiler/xla/service/name_uniquer.cc b/tensorflow/compiler/xla/service/name_uniquer.cc
index daa718879ddd45afb02725b557380b2f49fe833e..f6feed29935a1446499559d947dff0a8eefe5d2e 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer.cc
@@ -34,7 +34,7 @@ bool IsAllowed(char character) {
 }  // namespace
 
 NameUniquer::NameUniquer(const string& separator) {
-  CHECK(std::all_of(separator.begin(), separator.end(), IsAllowed))
+  CHECK(absl::c_all_of(separator, IsAllowed))
       << "separator should comprises allowed characters only";
   separator_ = separator;
 }
diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h
index c362a60d949a5b823b27b23cb0cb5f365e318cb3..9e3d1060210790f60243195a1c1dff13f1fc7fc5 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher.h
+++ b/tensorflow/compiler/xla/service/pattern_matcher.h
@@ -1878,7 +1878,7 @@ class HloInstructionPattern {
   // Make this a templated function to work around gcc 4.9.4 template infinite
   // recursion bug.
   template <typename Dummy = void>
-  constexpr auto WithShapeEqualTo(const ::xla::Shape* shape)
+  constexpr auto WithShapeEqualTo(const ::xla::Shape* shape) const
       -> decltype(this->WithShape(Shape().EqualTo(shape))) {
     return WithShape(Shape().EqualTo(shape));
   }
@@ -1886,7 +1886,7 @@ class HloInstructionPattern {
   // Make this a templated function to work around gcc 4.9.4 template infinite
   // recursion bug.
   template <typename Dummy = void>
-  constexpr auto WithShapeCompatibleTo(const ::xla::Shape* shape)
+  constexpr auto WithShapeCompatibleTo(const ::xla::Shape* shape) const
       -> decltype(this->WithShape(Shape().CompatibleTo(shape))) {
     return WithShape(Shape().CompatibleTo(shape));
   }
@@ -2057,7 +2057,6 @@ XLA_UNOP_PATTERN(SendDone)
 XLA_UNOP_PATTERN(Sign)
 XLA_UNOP_PATTERN(Sin)
 XLA_UNOP_PATTERN(Slice)
-XLA_UNOP_PATTERN(Sort)
 XLA_UNOP_PATTERN(Tanh)
 XLA_UNOP_PATTERN(Transpose)
 #undef XLA_UNOP_PATTERN
@@ -2119,7 +2118,6 @@ XLA_BINOP_PATTERN(Divide)
 XLA_BINOP_PATTERN(Complex)
 XLA_BINOP_PATTERN(Convolution)
 XLA_BINOP_PATTERN(Dot)
-XLA_BINOP_PATTERN(DynamicSlice)
 XLA_COMMUTATIVE_BINOP_PATTERN(Eq)
 XLA_BINOP_PATTERN(Gather)
 XLA_BINOP_PATTERN(Ge)
@@ -2236,8 +2234,10 @@ inline auto WithOperands(Matcher&& m, int64 operand_num, FirstArg&& first_arg,
 XLA_VARIADIC_OP_PATTERN(AfterAll);
 XLA_VARIADIC_OP_PATTERN(Concatenate);
 XLA_VARIADIC_OP_PATTERN(CustomCall);
+XLA_VARIADIC_OP_PATTERN(DynamicSlice)
 XLA_VARIADIC_OP_PATTERN(Map)
 XLA_VARIADIC_OP_PATTERN(Reduce);
+XLA_VARIADIC_OP_PATTERN(Sort);
 XLA_VARIADIC_OP_PATTERN(Tuple);
 
 // Helpers for matching non-constant instructions.
diff --git a/tensorflow/compiler/xla/service/platform_util.cc b/tensorflow/compiler/xla/service/platform_util.cc
index 896b73cda41cb21b539b586aa4701c5bad43f8b9..0491f641fc7835c404612d6f8dcda83a02ca97d6 100644
--- a/tensorflow/compiler/xla/service/platform_util.cc
+++ b/tensorflow/compiler/xla/service/platform_util.cc
@@ -260,8 +260,8 @@ PlatformUtil::GetStreamExecutors(
     // Block here in thread_pool destructor until all devices are initialized.
   }
   VLOG(1) << "Device initialization complete";
-  if (std::all_of(stream_executors.begin(), stream_executors.end(),
-                  [](se::StreamExecutor* s) { return s == nullptr; })) {
+  if (absl::c_all_of(stream_executors,
+                     [](se::StreamExecutor* s) { return s == nullptr; })) {
     return InternalError("no supported devices found for platform %s",
                          platform->Name());
   }
diff --git a/tensorflow/compiler/xla/service/scatter_expander.cc b/tensorflow/compiler/xla/service/scatter_expander.cc
index e8496dbd72bbe0a2e78067dda209cdb4b620e826..036c3c36f648daf8963a6b25e300b93c1bdf78d9 100644
--- a/tensorflow/compiler/xla/service/scatter_expander.cc
+++ b/tensorflow/compiler/xla/service/scatter_expander.cc
@@ -26,7 +26,6 @@ limitations under the License.
 
 namespace xla {
 
-
 // Transposes the given scatter_indices such that the index_vector_dim becomes
 // the most-minor dimension.
 static StatusOr<HloInstruction*> TransposeIndexVectorDimToLast(
@@ -60,6 +59,13 @@ static StatusOr<HloInstruction*> CanonicalizeScatterIndices(
   TF_ASSIGN_OR_RETURN(
       HloInstruction * transposed_scatter_indices,
       TransposeIndexVectorDimToLast(scatter_indices, index_vector_dim));
+  if (scatter_indices->shape().rank() == index_vector_dim + 1 &&
+      scatter_indices->shape().dimensions(index_vector_dim) == 1) {
+    auto new_shape =
+        ShapeUtil::DeleteDimension(index_vector_dim, scatter_indices->shape());
+    TF_ASSIGN_OR_RETURN(scatter_indices,
+                        MakeReshapeHlo(new_shape, scatter_indices));
+  }
   bool indices_are_scalar =
       index_vector_dim == scatter_indices->shape().dimensions_size();
 
@@ -214,9 +220,6 @@ static StatusOr<std::vector<HloInstruction*>> ScatterLoopBody(
   HloInstruction* updates = loop_state[2];
 
   bool has_scalar_indices = scatter_indices->shape().dimensions_size() == 1;
-  CHECK_EQ(has_scalar_indices,
-           dim_numbers.index_vector_dim() ==
-               scatter->operand(1)->shape().dimensions_size());
 
   // Build a vector form of the induction variable of the while loop.
   TF_ASSIGN_OR_RETURN(
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index a0126f39b3dc4281abedc36a19dd20c3b128e249..2732d498d80121fcbff037d4e3bcd226c61cae2f 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
@@ -552,9 +553,7 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
     options.set_intra_op_thread_pool(
         backend->eigen_intra_op_thread_pool_device());
     options.set_device_assignment(&device_assignment);
-    run_options.emplace_back(
-        options, backend->StreamBorrower(),
-        /*xla_intra_op_thread_pool=*/backend->eigen_intra_op_thread_pool());
+    run_options.emplace_back(options, backend->StreamBorrower());
   }
 
   if (options_.number_of_replicas() == 1) {
@@ -1097,9 +1096,12 @@ Status Service::ComputeConstantGraph(const ComputeConstantGraphRequest* arg,
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
                       CreateModuleFromProto(arg->computation(), config));
 
+  TF_ASSIGN_OR_RETURN(DynamicDimensionInference dynamic_dimension_inference,
+                      DynamicDimensionInference::Run(module.get()));
+
   HloEvaluator evaluator;
-  TF_ASSIGN_OR_RETURN(auto result_literal, evaluator.Evaluate<Literal>(
-                                               *module, /*arg_literals=*/{}));
+  evaluator.set_dynamic_dimension_inference(&dynamic_dimension_inference);
+  TF_ASSIGN_OR_RETURN(auto result_literal, evaluator.Evaluate(*module, {}));
 
   // Since the result layout is non-effective to the Evaluator results, explicit
   // relayout here.
diff --git a/tensorflow/compiler/xla/service/service_executable_run_options.h b/tensorflow/compiler/xla/service/service_executable_run_options.h
index dbfed628bfcabffe66bef41a82e0e2430897d80d..6bee671056552b83014367889320b748659bbfdf 100644
--- a/tensorflow/compiler/xla/service/service_executable_run_options.h
+++ b/tensorflow/compiler/xla/service/service_executable_run_options.h
@@ -32,12 +32,10 @@ class ServiceExecutableRunOptions {
   ServiceExecutableRunOptions()
       : ServiceExecutableRunOptions(ExecutableRunOptions()) {}
 
-  explicit ServiceExecutableRunOptions(
-      ExecutableRunOptions run_options, StreamBorrower borrow_stream = nullptr,
-      tensorflow::thread::ThreadPool* xla_intra_op_thread_pool = nullptr)
+  explicit ServiceExecutableRunOptions(ExecutableRunOptions run_options,
+                                       StreamBorrower borrow_stream = nullptr)
       : run_options_(std::move(run_options)),
-        borrow_stream_(std::move(borrow_stream)),
-        xla_intra_op_thread_pool_(xla_intra_op_thread_pool) {}
+        borrow_stream_(std::move(borrow_stream)) {}
 
   // Returns reference or pointer to `ExecutableRunOptions` member.
   const ExecutableRunOptions& run_options() const { return run_options_; }
@@ -56,15 +54,9 @@ class ServiceExecutableRunOptions {
                : Status(tensorflow::error::UNIMPLEMENTED, "No stream cache");
   }
 
-  // Returns reference to thread pool for execution of XLA ops on CPU backend.
-  tensorflow::thread::ThreadPool* xla_intra_op_thread_pool() const {
-    return xla_intra_op_thread_pool_;
-  }
-
  private:
   ExecutableRunOptions run_options_;
   StreamBorrower borrow_stream_;
-  tensorflow::thread::ThreadPool* xla_intra_op_thread_pool_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index b0e241d216dc5c6c3d6cbde2c12350f0647dd8ef..1d3f84af955df0736d8bbd87fe71152d8631bfee 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -534,9 +534,8 @@ Status ValidateDotDimensionNumbers(
                           absl::Span<const int64> contracting_dims,
                           absl::Span<const int64> batch_dims) -> bool {
     auto in_range = [&rank](int64 i) -> bool { return 0 <= i && i < rank; };
-    return std::all_of(contracting_dims.begin(), contracting_dims.end(),
-                       in_range) &&
-           std::all_of(batch_dims.begin(), batch_dims.end(), in_range);
+    return absl::c_all_of(contracting_dims, in_range) &&
+           absl::c_all_of(batch_dims, in_range);
   };
 
   absl::Span<const int64> lhs_contracting_dimensions =
@@ -563,9 +562,8 @@ Status ValidateDotDimensionNumbers(
     auto is_unique = [&dim_set](int64 i) -> bool {
       return dim_set.insert(i).second;
     };
-    return std::all_of(contracting_dims.begin(), contracting_dims.end(),
-                       is_unique) &&
-           std::all_of(batch_dims.begin(), batch_dims.end(), is_unique);
+    return absl::c_all_of(contracting_dims, is_unique) &&
+           absl::c_all_of(batch_dims, is_unique);
   };
 
   if (!dims_unique(lhs_contracting_dimensions, lhs_batch_dimensions) ||
@@ -1589,29 +1587,29 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   input_dnums[1] = dnums.input_feature_dimension();
   std::copy(dnums.input_spatial_dimensions().begin(),
             dnums.input_spatial_dimensions().end(), input_dnums.begin() + 2);
-  std::sort(input_dnums.begin(), input_dnums.end());
+  absl::c_sort(input_dnums);
 
   std::vector<int64> window_dnums(num_dims);
   window_dnums[0] = dnums.kernel_input_feature_dimension();
   window_dnums[1] = dnums.kernel_output_feature_dimension();
   std::copy(dnums.kernel_spatial_dimensions().begin(),
             dnums.kernel_spatial_dimensions().end(), window_dnums.begin() + 2);
-  std::sort(window_dnums.begin(), window_dnums.end());
+  absl::c_sort(window_dnums);
 
   std::vector<int64> output_dnums(num_dims);
   output_dnums[0] = dnums.output_batch_dimension();
   output_dnums[1] = dnums.output_feature_dimension();
   std::copy(dnums.output_spatial_dimensions().begin(),
             dnums.output_spatial_dimensions().end(), output_dnums.begin() + 2);
-  std::sort(output_dnums.begin(), output_dnums.end());
+  absl::c_sort(output_dnums);
 
   std::vector<int64> expected_dnums(num_dims);
   std::iota(expected_dnums.begin(), expected_dnums.end(), 0);
 
   const auto in_range = [num_dims](int64 i) { return 0 <= i && i < num_dims; };
-  if (!std::all_of(input_dnums.begin(), input_dnums.end(), in_range) ||
-      !std::all_of(window_dnums.begin(), window_dnums.end(), in_range) ||
-      !std::all_of(output_dnums.begin(), output_dnums.end(), in_range)) {
+  if (!absl::c_all_of(input_dnums, in_range) ||
+      !absl::c_all_of(window_dnums, in_range) ||
+      !absl::c_all_of(output_dnums, in_range)) {
     return InvalidArgument(
         "A dimension number is out of range in convolution: %s.",
         dnums.DebugString());
@@ -2087,35 +2085,81 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferDynamicSliceShape(
-    const Shape& operand_shape, const Shape& start_indices_shape,
-    absl::Span<const int64> slice_sizes) {
+    const Shape& operand_shape, absl::Span<const Shape> start_index_shapes,
+    absl::Span<const int64> slice_sizes, bool allow_scalar_indices) {
   TF_RETURN_IF_ERROR(ExpectArray(operand_shape, "operand of dynamic slice"));
-  TF_RETURN_IF_ERROR(
-      ExpectArray(start_indices_shape, "start indices of dynamic slice"));
+  auto number_of_indices = start_index_shapes.size();
+  // TODO(b/118437727): Remove this path.
+  if (!allow_scalar_indices ||
+      (number_of_indices >= 1 && start_index_shapes[0].rank() == 1)) {
+    if (number_of_indices != 1) {
+      return InvalidArgument(
+          "Dynamic slice should have exactly 1 index operand, has %d.",
+          number_of_indices);
+    }
 
-  VLOG(2) << StrFormat(
-      "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}",
-      ShapeUtil::HumanString(operand_shape),
-      ShapeUtil::HumanString(start_indices_shape), StrJoin(slice_sizes, ", "));
+    const Shape& start_indices_shape = start_index_shapes[0];
+    VLOG(2) << StrFormat(
+        "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}",
+        ShapeUtil::HumanString(operand_shape),
+        ShapeUtil::HumanString(start_indices_shape),
+        StrJoin(slice_sizes, ", "));
 
-  if (start_indices_shape.rank() != 1) {
-    return InvalidArgument(
-        "Dynamic slice start indices of rank %d must be rank1.",
-        start_indices_shape.rank());
-  }
+    TF_RETURN_IF_ERROR(
+        ExpectArray(start_indices_shape, "start indices of dynamic slice"));
 
-  if (!ShapeUtil::ElementIsIntegral(start_indices_shape)) {
-    return InvalidArgument(
-        "Dynamic slice start indices must be of integral type.");
-  }
+    if (start_indices_shape.rank() != 1) {
+      return InvalidArgument(
+          "Dynamic slice start indices of rank %d must be rank1.",
+          start_indices_shape.rank());
+    }
 
-  const int64 start_num_dims = start_indices_shape.dimensions(0);
-  if (operand_shape.rank() != start_num_dims) {
-    return InvalidArgument(
-        "Dynamic slice start number of dimensions %d (%s) must match rank "
-        "%d of slice input (%s).",
-        start_num_dims, ShapeUtil::HumanString(start_indices_shape),
-        operand_shape.rank(), ShapeUtil::HumanString(operand_shape));
+    if (!ShapeUtil::ElementIsIntegral(start_indices_shape)) {
+      return InvalidArgument(
+          "Dynamic slice start indices must be of integral type.");
+    }
+
+    const int64 start_num_dims = start_indices_shape.dimensions(0);
+    if (operand_shape.rank() != start_num_dims) {
+      return InvalidArgument(
+          "Dynamic slice start number of dimensions %d (%s) must match rank "
+          "%d of slice input (%s).",
+          start_num_dims, ShapeUtil::HumanString(start_indices_shape),
+          operand_shape.rank(), ShapeUtil::HumanString(operand_shape));
+    }
+  } else {
+    VLOG(2) << StrFormat("slicing shape %s a with slice_sizes={%s}",
+                         ShapeUtil::HumanString(operand_shape),
+                         StrJoin(slice_sizes, ", "));
+
+    if (operand_shape.rank() != number_of_indices) {
+      return InvalidArgument(
+          "Dynamic slice start number of dimensions %d must match rank "
+          "%d of slice input (%s).",
+          number_of_indices, operand_shape.rank(),
+          ShapeUtil::HumanString(operand_shape));
+    }
+
+    if (number_of_indices > 0) {
+      const Shape& first_index_shape = start_index_shapes[0];
+      if (!ShapeUtil::IsScalar(first_index_shape)) {
+        return InvalidArgument("Dynamic slice indices must be scalar, not %s.",
+                               ShapeUtil::HumanString(first_index_shape));
+      }
+      if (!ShapeUtil::ElementIsIntegral(first_index_shape)) {
+        return InvalidArgument(
+            "Dynamic slice start indices must be of integral type.");
+      }
+      for (const Shape& index_shape : start_index_shapes) {
+        if (!ShapeUtil::Compatible(first_index_shape, index_shape)) {
+          return InvalidArgument(
+              "Dynamic slice start indices must all have the same shape, got "
+              "mismatching indices with shapes %s and %s.",
+              ShapeUtil::HumanString(first_index_shape),
+              ShapeUtil::HumanString(index_shape));
+        }
+      }
+    }
   }
 
   if (slice_sizes.size() != operand_shape.rank()) {
@@ -2144,39 +2188,85 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 
 /* static */ StatusOr<Shape> ShapeInference::InferDynamicUpdateSliceShape(
     const Shape& operand_shape, const Shape& update_shape,
-    const Shape& start_indices_shape) {
+    absl::Span<const Shape> start_index_shapes, bool allow_scalar_indices) {
   TF_RETURN_IF_ERROR(
       ExpectArray(operand_shape, "operand of dynamic update slice"));
   TF_RETURN_IF_ERROR(
       ExpectArray(update_shape, "update of dynamic update slice"));
-  TF_RETURN_IF_ERROR(ExpectArray(start_indices_shape,
-                                 "start indices of dynamic update slice"));
 
-  VLOG(2) << StrFormat(
-      "updating slice of shape %s at dynamic start_indices %s with update "
-      "shape %s",
-      ShapeUtil::HumanString(operand_shape),
-      ShapeUtil::HumanString(start_indices_shape),
-      ShapeUtil::HumanString(update_shape));
+  auto number_of_indices = start_index_shapes.size();
+  // TODO(b/118437727): Remove this path.
+  if (!allow_scalar_indices ||
+      (number_of_indices >= 1 && start_index_shapes[0].rank() == 1)) {
+    if (number_of_indices != 1) {
+      return InvalidArgument(
+          "Dynamic update slice should have exactly 1 index operand, has %d.",
+          number_of_indices);
+    }
+    const Shape& start_indices_shape = start_index_shapes[0];
+    TF_RETURN_IF_ERROR(ExpectArray(start_indices_shape,
+                                   "start indices of dynamic update slice"));
 
-  if (start_indices_shape.rank() != 1) {
-    return InvalidArgument(
-        "Dynamic update slice start indices of rank %d must be rank1.",
-        start_indices_shape.rank());
-  }
+    VLOG(2) << StrFormat(
+        "updating slice of shape %s at dynamic start_indices %s with update "
+        "shape %s",
+        ShapeUtil::HumanString(operand_shape),
+        ShapeUtil::HumanString(start_indices_shape),
+        ShapeUtil::HumanString(update_shape));
 
-  if (!ShapeUtil::ElementIsIntegral(start_indices_shape)) {
-    return InvalidArgument(
-        "Dynamic update slice start indices must be of integral type.");
-  }
+    if (start_indices_shape.rank() != 1) {
+      return InvalidArgument(
+          "Dynamic update slice start indices of rank %d must be rank1.",
+          start_indices_shape.rank());
+    }
 
-  const int64 start_num_dims = start_indices_shape.dimensions(0);
-  if (operand_shape.rank() != start_num_dims) {
-    return InvalidArgument(
-        "Dynamic update slice start number of dimensions %d (%s) must match "
-        "rank %d of slice input (%s).",
-        start_num_dims, ShapeUtil::HumanString(start_indices_shape),
-        operand_shape.rank(), ShapeUtil::HumanString(operand_shape));
+    if (!ShapeUtil::ElementIsIntegral(start_indices_shape)) {
+      return InvalidArgument(
+          "Dynamic update slice start indices must be of integral type.");
+    }
+
+    const int64 start_num_dims = start_indices_shape.dimensions(0);
+    if (operand_shape.rank() != start_num_dims) {
+      return InvalidArgument(
+          "Dynamic update slice start number of dimensions %d (%s) must match "
+          "rank %d of slice input (%s).",
+          start_num_dims, ShapeUtil::HumanString(start_indices_shape),
+          operand_shape.rank(), ShapeUtil::HumanString(operand_shape));
+    }
+  } else {
+    VLOG(2) << StrFormat("updating slice of shape %s with update shape %s",
+                         ShapeUtil::HumanString(operand_shape),
+                         ShapeUtil::HumanString(update_shape));
+
+    if (operand_shape.rank() != number_of_indices) {
+      return InvalidArgument(
+          "Dynamic update slice start number of dimensions %d must match rank "
+          "%d of slice input (%s).",
+          number_of_indices, operand_shape.rank(),
+          ShapeUtil::HumanString(operand_shape));
+    }
+
+    if (number_of_indices > 0) {
+      const Shape& first_index_shape = start_index_shapes[0];
+      if (!ShapeUtil::IsScalar(first_index_shape)) {
+        return InvalidArgument(
+            "Dynamic update slice indices must be scalar, not %s.",
+            ShapeUtil::HumanString(first_index_shape));
+      }
+      if (!ShapeUtil::ElementIsIntegral(first_index_shape)) {
+        return InvalidArgument(
+            "Dynamic update slice start indices must be of integral type.");
+      }
+      for (const Shape& index_shape : start_index_shapes) {
+        if (!ShapeUtil::Compatible(first_index_shape, index_shape)) {
+          return InvalidArgument(
+              "Dynamic update slice start indices must all have the same "
+              "shape, got mismatching indices with shapes %s and %s.",
+              ShapeUtil::HumanString(first_index_shape),
+              ShapeUtil::HumanString(index_shape));
+        }
+      }
+    }
   }
 
   if (update_shape.rank() != operand_shape.rank()) {
@@ -2268,7 +2358,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   };
 
   // Check the shapes of computation parameters and return types.
-  if (!ShapeUtil::ShapeIs(condition.result(), PRED, {})) {
+  if (!ShapeUtil::Equal(condition.result(), ShapeUtil::MakeShape(PRED, {}))) {
     return InvalidArgument("Condition must return a boolean; got %s.",
                            shape_string());
   }
@@ -2288,7 +2378,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     const Shape& predicate, const Shape& true_operand,
     const Shape& false_operand, const ProgramShape& true_computation,
     const ProgramShape& false_computation) {
-  if (!ShapeUtil::ShapeIs(predicate, PRED, {})) {
+  if (!ShapeUtil::Equal(predicate, ShapeUtil::MakeShape(PRED, {}))) {
     return InvalidArgument("Predicate must be a boolean; got %s.",
                            ShapeUtil::HumanString(predicate));
   }
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index 1b8fd10d691498087b28ef68517868c5def1da5a..7d39ef38e05abf0a81683c1fb0f3999908b27d23 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -176,14 +176,15 @@ class ShapeInference {
   // Infers the shape produced by a dynamic slice operation of size specified
   // in 'slice_sizes', with dynamic start indices shape 'start_indices_shape'.
   static StatusOr<Shape> InferDynamicSliceShape(
-      const Shape& operand_shape, const Shape& start_indices_shape,
-      absl::Span<const int64> slice_sizes);
+      const Shape& operand_shape, absl::Span<const Shape> start_index_shapes,
+      absl::Span<const int64> slice_sizes, bool allow_scalar_indices = true);
 
   // Infers the shape produced by a dynamic update slice operation based
   // on the shape of operand and update.
   static StatusOr<Shape> InferDynamicUpdateSliceShape(
       const Shape& operand_shape, const Shape& update_shape,
-      const Shape& start_indices_shape);
+      absl::Span<const Shape> start_index_shapes,
+      bool allow_scalar_indices = true);
 
   // Infers the shape produced by doing a compile-time-constant indexing into
   // the given input shape. This is essential for operations on tuples, because
diff --git a/tensorflow/compiler/xla/service/transpose_folding.cc b/tensorflow/compiler/xla/service/transpose_folding.cc
index 15eb46bac0ac4e03d476ee21be6834174798526c..a95ca2bf2a8fcd700eb9234cafbfce9b62f2370c 100644
--- a/tensorflow/compiler/xla/service/transpose_folding.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding.cc
@@ -130,8 +130,7 @@ bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) {
 
   HloInstruction* new_lhs;
   const int64 kLhsIdx = 0;
-  if (std::find(operand_indices.begin(), operand_indices.end(), kLhsIdx) !=
-      operand_indices.end()) {
+  if (absl::c_linear_search(operand_indices, kLhsIdx)) {
     HloInstruction& transpose = *convolution.mutable_operand(kLhsIdx);
     const auto& transpose_dimensions = transpose.dimensions();
     HloInstruction& transpose_operand = *transpose.mutable_operand(0);
@@ -154,8 +153,7 @@ bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) {
 
   HloInstruction* new_rhs;
   const int64 kRhsIdx = 1;
-  if (std::find(operand_indices.begin(), operand_indices.end(), kRhsIdx) !=
-      operand_indices.end()) {
+  if (absl::c_linear_search(operand_indices, kRhsIdx)) {
     HloInstruction& transpose = *convolution.mutable_operand(kRhsIdx);
     const auto& transpose_dimensions = transpose.dimensions();
     HloInstruction& transpose_operand = *transpose.mutable_operand(0);
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index b1f0672c6089035153aa48c853bface116f70026..5e505aaf02f157d0cba9dff42b1a9b89a6691504 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
@@ -55,11 +56,10 @@ bool PointsToSet::IsAmbiguous() const {
 
 bool PointsToSet::IsDistinct() const {
   bool distinct = true;
-  std::set<const LogicalBuffer*> all_points_to;
-  ForEachElement([&distinct, &all_points_to](const ShapeIndex& /*index*/,
-                                             const BufferList& points_to) {
+  absl::flat_hash_set<const LogicalBuffer*> all_points_to;
+  ForEachElement([&](const ShapeIndex& /*index*/, const BufferList& points_to) {
     for (auto& buffer : points_to) {
-      if (all_points_to.count(buffer) != 0) {
+      if (all_points_to.contains(buffer)) {
         distinct = false;
       }
       all_points_to.insert(buffer);
@@ -87,9 +87,7 @@ bool PointsToSet::ContainsBuffer(const LogicalBuffer& buffer) const {
   bool found = false;
   ForEachElement([&found, &buffer](const ShapeIndex& /*index*/,
                                    const BufferList& pointed_to_buffers) {
-    if (!found &&
-        std::find(pointed_to_buffers.begin(), pointed_to_buffers.end(),
-                  &buffer) != pointed_to_buffers.end()) {
+    if (!found && absl::c_linear_search(pointed_to_buffers, &buffer)) {
       found = true;
     }
   });
@@ -99,8 +97,7 @@ bool PointsToSet::ContainsBuffer(const LogicalBuffer& buffer) const {
 bool PointsToSet::ContainsBufferAtIndex(const LogicalBuffer& buffer,
                                         const ShapeIndex& index) const {
   const auto& pointed_to_buffers = element(index);
-  return std::find(pointed_to_buffers.begin(), pointed_to_buffers.end(),
-                   &buffer) != pointed_to_buffers.end();
+  return absl::c_linear_search(pointed_to_buffers, &buffer);
 }
 
 void PointsToSet::AddPointedToBuffer(const LogicalBuffer& buffer,
@@ -604,9 +601,8 @@ bool TuplePointsToAnalysis::DoesNotUseOperandBuffer(
   } else if (user->opcode() == HloOpcode::kFusion &&
              user->fusion_kind() == HloInstruction::FusionKind::kLoop) {
     // Find fusion parameter associated with 'operand'.
-    auto it = std::find_if(
-        user->fused_parameters().begin(), user->fused_parameters().end(),
-        [=](HloInstruction* fused_param) {
+    auto it = absl::c_find_if(
+        user->fused_parameters(), [&](HloInstruction* fused_param) {
           return user->operand(fused_param->parameter_number()) == operand;
         });
     CHECK(it != user->fused_parameters().end());
@@ -672,9 +668,8 @@ bool TuplePointsToAnalysis::HasUniqueFusedUseOfOperandAt(
   }
   // Find fusion parameter associated with 'operand'.
   const auto& fused_params = fusion->fused_parameters();
-  auto fused_param_it = std::find_if(
-      fused_params.begin(), fused_params.end(),
-      [&](HloInstruction* fused_param) {
+  auto fused_param_it =
+      absl::c_find_if(fused_params, [&](HloInstruction* fused_param) {
         return fusion->operand(fused_param->parameter_number()) == operand;
       });
   if (fused_param_it == fused_params.end()) {
@@ -743,11 +738,10 @@ bool TuplePointsToAnalysis::CanShareOperandBufferWithUser(
       // Check if one operand of kAdd fused root is kDot or kConvolution.
       auto* add = user->fused_expression_root();
       auto add_operand_it =
-          std::find_if(add->operands().begin(), add->operands().end(),
-                       [&](HloInstruction* operand) {
-                         return operand->opcode() == HloOpcode::kConvolution ||
-                                operand->opcode() == HloOpcode::kDot;
-                       });
+          absl::c_find_if(add->operands(), [&](HloInstruction* operand) {
+            return operand->opcode() == HloOpcode::kConvolution ||
+                   operand->opcode() == HloOpcode::kDot;
+          });
       if (add_operand_it == add->operands().end()) {
         return false;
       }
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
index d8875ca74716420ba0d54e4aba53c99001989996..7599e1e6adcff4571068c59b074fba5aaa9e9e8d 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
@@ -721,9 +721,8 @@ class FusionPointsToAnalysisTest : public TuplePointsToAnalysisTest {
   // to fusion 'operand'.
   HloInstruction* GetFusionParameterForOperand(HloInstruction* fusion,
                                                HloInstruction* operand) {
-    auto it = std::find_if(
-        fusion->fused_instructions().begin(),
-        fusion->fused_instructions().end(), [=](const HloInstruction* fused) {
+    auto it = absl::c_find_if(
+        fusion->fused_instructions(), [&](const HloInstruction* fused) {
           return fused->opcode() == HloOpcode::kParameter &&
                  fusion->operand(fused->parameter_number()) == operand;
         });
diff --git a/tensorflow/compiler/xla/service/while_loop_analysis.cc b/tensorflow/compiler/xla/service/while_loop_analysis.cc
index 68e2569f66bea9ec1223e454d1ead0efc7b9498e..c93a9ba3176002a34fe84a29e62075de4d19168f 100644
--- a/tensorflow/compiler/xla/service/while_loop_analysis.cc
+++ b/tensorflow/compiler/xla/service/while_loop_analysis.cc
@@ -301,7 +301,7 @@ optional<int64> ComputeWhileLoopTripCountUpperBound(HloInstruction* while_op) {
                                   /*dest_shape_index=*/{indvar_index},
                                   /*src_shape_index=*/{}));
   StatusOr<Literal> eval_result =
-      evaluator.Evaluate<Literal>(*while_cond, {std::move(fake_input)});
+      evaluator.Evaluate(*while_cond, {std::move(fake_input)});
 
   if (!eval_result.ok()) {
     VLOG(2) << "Couldn't evaluate while loop condition.";
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
index a1c627a319f19c83fd676000f9de971efe377dbd..69cc8feb3f31ad782b9d3437d81d0ab8ce10aadb 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
@@ -89,7 +89,7 @@ static void CreateLoopInvariantCopy(
 
     HloInstruction* next_operand =
         frame->instruction->mutable_operand(frame->operand_index++);
-    if (hoisted_instructions->count(next_operand) ||
+    if (hoisted_instructions->contains(next_operand) ||
         next_operand == while_body_param) {
       continue;
     }
@@ -241,7 +241,7 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
 
     auto is_invariant = [&](HloInstruction* op) {
       return hoisted_instructions.find(op) != hoisted_instructions.end() ||
-             unhoisted_invariant_instructions.count(op) ||
+             unhoisted_invariant_instructions.contains(op) ||
              op->opcode() == HloOpcode::kConstant;
     };
 
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
index 8e7c4bc8828552e197b41f874c070d496b85a382..3587c016b4420163a607422b1acc838646fab83a 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
@@ -299,7 +299,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) {
   // bitcast either.
   auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
-  auto scalar_f32 = ShapeUtil::MakeShape(F32, {});
+  auto effective_scalar_s32 = ShapeUtil::MakeShape(S32, {1});
   auto token_shape = ShapeUtil::MakeTokenShape();
   Shape while_shape =
       ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, token_shape});
@@ -314,10 +314,12 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) {
         HloInstruction::CreateGetTupleElement(scalar_s32, param, 1));
     HloInstruction* in_token = builder.AddInstruction(
         HloInstruction::CreateGetTupleElement(token_shape, param, 2));
-    HloInstruction* bitcast_inst = builder.AddInstruction(
-        HloInstruction::CreateUnary(scalar_f32, HloOpcode::kBitcast, gte_0));
-    HloInstruction* out_token = builder.AddInstruction(
-        HloInstruction::CreateOutfeed(scalar_f32, bitcast_inst, in_token, ""));
+    HloInstruction* bitcast_inst =
+        builder.AddInstruction(HloInstruction::CreateUnary(
+            effective_scalar_s32, HloOpcode::kBitcast, gte_0));
+    HloInstruction* out_token =
+        builder.AddInstruction(HloInstruction::CreateOutfeed(
+            effective_scalar_s32, bitcast_inst, in_token, ""));
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte_0, gte_1, out_token}));
 
@@ -352,9 +354,9 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistBitcastIfNeeded) {
   // The bitcast's user can be hoisted, so hoist the bitcast too.
   auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
-  auto scalar_f32 = ShapeUtil::MakeShape(F32, {});
-  Shape while_shape =
-      ShapeUtil::MakeTupleShape({scalar_s32, scalar_f32, scalar_f32});
+  auto effective_scalar_s32 = ShapeUtil::MakeShape(S32, {1});
+  Shape while_shape = ShapeUtil::MakeTupleShape(
+      {scalar_s32, effective_scalar_s32, effective_scalar_s32});
 
   HloComputation* while_body = [&]() {
     HloComputation::Builder builder(TestName() + ".while_body");
@@ -363,12 +365,13 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistBitcastIfNeeded) {
     HloInstruction* gte_0 = builder.AddInstruction(
         HloInstruction::CreateGetTupleElement(scalar_s32, param, 0));
     HloInstruction* gte_1 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(scalar_f32, param, 1));
-    HloInstruction* bitcast_inst = builder.AddInstruction(
-        HloInstruction::CreateUnary(scalar_f32, HloOpcode::kBitcast, gte_0));
+        HloInstruction::CreateGetTupleElement(effective_scalar_s32, param, 1));
+    HloInstruction* bitcast_inst =
+        builder.AddInstruction(HloInstruction::CreateUnary(
+            effective_scalar_s32, HloOpcode::kBitcast, gte_0));
     HloInstruction* add_inst =
         builder.AddInstruction(HloInstruction::CreateBinary(
-            scalar_f32, HloOpcode::kAdd, bitcast_inst, gte_1));
+            effective_scalar_s32, HloOpcode::kAdd, bitcast_inst, gte_1));
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte_0, gte_1, add_inst}));
 
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
index 772faa25e299ebd96d46545905bbc66b6e23c82e..09d54095718029541a7a25aa62f9a2e9a177960d 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
@@ -109,8 +109,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
       // operand appears in, but it may appear more than once!
       if (user->user_count() == 1 && user->users().front() == while_body_root &&
           while_body_root->operand_index(user) == user->tuple_index() &&
-          std::count(while_body_root->operands().begin(),
-                     while_body_root->operands().end(), user) == 1) {
+          absl::c_count(while_body_root->operands(), user) == 1) {
         continue;
       }
 
@@ -127,7 +126,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   // through to the while body's root, count that element as "used", since
   // removing that element would be observable.
   for (int64 i = 0; i < while_body_root->operand_count(); ++i) {
-    if (used_tuple_indices.count(i)) {
+    if (used_tuple_indices.contains(i)) {
       continue;
     }
 
@@ -158,7 +157,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   // Build up maps from the old/new to the new/old tuple indices.
   std::vector<int64> new_to_old_tuple_idx(used_tuple_indices.begin(),
                                           used_tuple_indices.end());
-  std::sort(new_to_old_tuple_idx.begin(), new_to_old_tuple_idx.end());
+  absl::c_sort(new_to_old_tuple_idx);
 
   absl::flat_hash_map<int64, int64> old_to_new_tuple_idx;
   for (int64 new_idx = 0; new_idx < new_to_old_tuple_idx.size(); ++new_idx) {
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
index 3713989ca2f64ee1d94c9f77255017909d957da2..ecca76b1e86d833c73fbb9bad6a341660a7d2669 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
@@ -407,13 +407,12 @@ TEST_F(WhileLoopSimplifierTest, RemoveUnusedLoopOperands) {
   // The original while instruction is still left in the module as a dead
   // instruction, find a while instruction with a different name as the new
   // while instruction.
+  const auto& instrs = m->entry_computation()->instructions();
   HloInstruction* new_while_op =
-      *std::find_if(m->entry_computation()->instructions().begin(),
-                    m->entry_computation()->instructions().end(),
-                    [&](const HloInstruction* instr) {
-                      return (instr->opcode() == HloOpcode::kWhile &&
-                              instr->name() != "while");
-                    });
+      *absl::c_find_if(instrs, [&](const HloInstruction* instr) {
+        return (instr->opcode() == HloOpcode::kWhile &&
+                instr->name() != "while");
+      });
 
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   EXPECT_TRUE(
diff --git a/tensorflow/compiler/xla/shape.cc b/tensorflow/compiler/xla/shape.cc
index c04a91615437d315934866448b70846ccf3bad40..1a029efe8543b5433ef5fe7923e1e804019ba0c0 100644
--- a/tensorflow/compiler/xla/shape.cc
+++ b/tensorflow/compiler/xla/shape.cc
@@ -27,7 +27,19 @@ Shape::Shape(const ShapeProto& shape_proto) {
   for (const int64 dimension : shape_proto.dimensions()) {
     add_dimensions(dimension);
   }
-  for (int i = 0; i < shape_proto.is_dynamic_dimension_size(); i++) {
+  // A malformed proto may have different is_dynamic_dimension_size and
+  // dimensions_size. Since C++ is evil, and we have no good way of bailing out
+  // in a constructor, conservatively trim the is_dynamic_dimension size.
+  // TODO(b/120111794): Make this a hard error when we have a factory method
+  // instead of a constructor.
+  if (shape_proto.dimensions_size() !=
+      shape_proto.is_dynamic_dimension_size()) {
+    LOG(ERROR) << "Malformed shape proto: number of is_dynamic_dimension "
+                  "fields does not match number of dimension fields";
+  }
+  int64 num_dynamic_dimension_fields = std::min(
+      shape_proto.dimensions_size(), shape_proto.is_dynamic_dimension_size());
+  for (int i = 0; i < num_dynamic_dimension_fields; i++) {
     dynamic_dimensions_[i] = shape_proto.is_dynamic_dimension(i);
   }
   tuple_shapes_.reserve(shape_proto.tuple_shapes_size());
@@ -68,19 +80,18 @@ string Shape::ToString(bool print_layout) const {
 }
 
 bool Shape::is_static() const {
-  if (ShapeUtil::IsTuple(*this)) {
+  if (IsTuple()) {
     for (const Shape& subshape : tuple_shapes_) {
       if (!subshape.is_static()) {
         return false;
       }
     }
   }
-  return !std::any_of(dynamic_dimensions_.begin(), dynamic_dimensions_.end(),
-                      [](bool b) { return b; });
+  return !absl::c_any_of(dynamic_dimensions_, [](bool b) { return b; });
 }
 
 void Shape::DeleteDimension(int64 dim_to_delete) {
-  CHECK(ShapeUtil::IsArray(*this));
+  CHECK(IsArray());
   CHECK_GE(dim_to_delete, 0);
   CHECK_LT(dim_to_delete, dimensions_.size());
   dimensions_.erase(dimensions_.begin() + dim_to_delete);
diff --git a/tensorflow/compiler/xla/shape.h b/tensorflow/compiler/xla/shape.h
index fb5dc8fba430fdb52eb0ea6a98356032e7beef44..dc4cdc31a74d43471b72a71d9d436408e0e62deb 100644
--- a/tensorflow/compiler/xla/shape.h
+++ b/tensorflow/compiler/xla/shape.h
@@ -138,7 +138,7 @@ class Shape {
   string ShortDebugString() const { return ToProto().ShortDebugString(); }
   string DebugString() const { return ToProto().DebugString(); }
 
- public:
+ private:
   // The element type of this shape (tuple, array, etc).
   PrimitiveType element_type_ = PRIMITIVE_TYPE_INVALID;
 
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index 7f68c0ae1fb1e3de12b0243829f8f02cc1137894..235b065585cb6a3dd76774e358d05e0a78a2f084 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -112,6 +112,7 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts,
 
   if (compare_layouts) {
     if (lhs.layout().format() != rhs.layout().format()) {
+      VLOG(3) << "CompareShapes: lhs layout format != rhs layout format";
       return false;
     }
     if (LayoutUtil::IsDenseArray(lhs)) {
@@ -145,7 +146,7 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts,
     return false;
   }
 
-  for (int i = 0; i < ShapeUtil::Rank(lhs); ++i) {
+  for (int i = 0; i < lhs.rank(); ++i) {
     if (lhs.is_dynamic_dimension(i) != rhs.is_dynamic_dimension(i)) {
       VLOG(3)
           << "CompareShapes: lhs and rhs have different dynamic dimensions.";
@@ -207,11 +208,6 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
   return equal;
 }
 
-/* static */ int64 ShapeUtil::Rank(const Shape& shape) {
-  CHECK(shape.IsArray()) << "Non-arrays do not have a rank, shape: " << shape;
-  return shape.dimensions_size();
-}
-
 /* static */ int64 ShapeUtil::TrueRank(const Shape& shape) {
   int64 accum = 0;
   for (int64 dimension : shape.dimensions()) {
@@ -343,7 +339,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 
 /* static */ void ShapeUtil::AppendMajorDimension(int bound, Shape* shape) {
   CHECK(LayoutUtil::IsDenseArray(*shape));
-  shape->mutable_layout()->add_minor_to_major(Rank(*shape));
+  shape->mutable_layout()->add_minor_to_major(shape->rank());
   shape->add_dimensions(bound);
   TF_DCHECK_OK(ValidateShape(*shape));
 }
@@ -358,7 +354,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 }
 
 /* static */ bool ShapeUtil::ElementHasBitWidth(const Shape& shape, int bits) {
-  if (!IsArray(shape)) {
+  if (!shape.IsArray()) {
     return false;
   }
   return primitive_util::BitWidth(shape.element_type()) == bits;
@@ -400,27 +396,24 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
   return primitive_util::IsFloatingPointType(shape.element_type());
 }
 
-/* static */ bool ShapeUtil::IsArray(const Shape& shape) {
-  return IsArrayPrimitiveType(shape.element_type());
-}
-
 /* static */ bool ShapeUtil::IsNestedTuple(const Shape& shape) {
-  return IsTuple(shape) && std::any_of(shape.tuple_shapes().begin(),
-                                       shape.tuple_shapes().end(), IsTuple);
+  return shape.IsTuple() &&
+         absl::c_any_of(shape.tuple_shapes(),
+                        [](const Shape& s) { return s.IsTuple(); });
 }
 
 /* static */ bool ShapeUtil::IsEmptyTuple(const Shape& shape) {
-  return IsTuple(shape) && TupleElementCount(shape) == 0;
+  return shape.IsTuple() && TupleElementCount(shape) == 0;
 }
 
 /* static */ int64 ShapeUtil::TupleElementCount(const Shape& shape) {
-  CHECK(IsTuple(shape)) << HumanString(shape);
+  CHECK(shape.IsTuple()) << HumanString(shape);
   return shape.tuple_shapes_size();
 }
 
 /* static */ const Shape& ShapeUtil::GetTupleElementShape(const Shape& shape,
                                                           int64 index) {
-  CHECK(IsTuple(shape));
+  CHECK(shape.IsTuple());
   CHECK_GT(TupleElementCount(shape), index);
   TF_DCHECK_OK(ValidateShapeWithOptionalLayout(shape.tuple_shapes(index)));
   return shape.tuple_shapes(index);
@@ -436,7 +429,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 /* static */ Shape ShapeUtil::SliceTuple(const Shape& tuple, int64 start,
                                          int64 limit) {
   TF_DCHECK_OK(ValidateShapeWithOptionalLayout(tuple));
-  CHECK(IsTuple(tuple));
+  CHECK(tuple.IsTuple());
   CHECK_LE(start, TupleElementCount(tuple));
   CHECK_LE(limit, TupleElementCount(tuple));
 
@@ -453,15 +446,9 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
                                               complex_shape.element_type()));
 }
 
-/* static */ bool ShapeUtil::ShapeIs(const Shape& shape,
-                                     PrimitiveType element_type,
-                                     std::initializer_list<int64> dimensions) {
-  return Equal(shape, MakeShape(element_type, dimensions));
-}
-
 /* static */ int64 ShapeUtil::ElementsIn(const Shape& shape) {
-  DCHECK(IsArray(shape)) << ShapeUtil::HumanString(shape);
-  DCHECK_EQ(shape.dimensions_size(), Rank(shape));
+  DCHECK(shape.IsArray()) << ShapeUtil::HumanString(shape);
+  DCHECK_EQ(shape.dimensions_size(), shape.rank());
   if (shape.dimensions().size() == 1) {
     return shape.dimensions()[0];
   }
@@ -471,8 +458,8 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 }
 
 /* static */ int64 ShapeUtil::ElementsInRecursive(const Shape& shape) {
-  CHECK(IsArray(shape) || IsTuple(shape));
-  if (IsArray(shape)) {
+  CHECK(shape.IsArray() || shape.IsTuple());
+  if (shape.IsArray()) {
     return ElementsIn(shape);
   }
   int64 count = 0;
@@ -505,7 +492,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 }
 
 /* static */ string ShapeUtil::HumanString(const Shape& shape) {
-  if (IsTuple(shape)) {
+  if (shape.IsTuple()) {
     string text = "(";
     const char* prefix = "";
     for (const Shape& elem_shape : shape.tuple_shapes()) {
@@ -529,7 +516,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 }
 
 /* static */ string ShapeUtil::HumanStringWithLayout(const Shape& shape) {
-  if (IsTuple(shape)) {
+  if (shape.IsTuple()) {
     string text = "(";
     const char* prefix = "";
     for (const Shape& elem_shape : shape.tuple_shapes()) {
@@ -545,7 +532,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
     StrAppend(&result, (i > 0) ? "," : "", shape.dimensions(i));
   }
   result += "]";
-  if (!IsScalar(shape) && IsArray(shape)) {
+  if (!IsScalar(shape) && shape.IsArray()) {
     if (LayoutUtil::HasLayout(shape)) {
       StrAppend(&result, LayoutUtil::HumanString(shape.layout()));
     }
@@ -580,8 +567,8 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 
 /* static */ bool ShapeUtil::CompatibleIgnoringElementType(const Shape& lhs,
                                                            const Shape& rhs) {
-  if (IsArray(lhs)) {
-    return IsArray(rhs) && SameDimensions(lhs, rhs);
+  if (lhs.IsArray()) {
+    return rhs.IsArray() && SameDimensions(lhs, rhs);
   } else if (lhs.element_type() == TUPLE) {
     return rhs.element_type() == TUPLE &&
            absl::c_equal(lhs.tuple_shapes(), rhs.tuple_shapes(),
@@ -594,8 +581,8 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 
 /* static */ bool ShapeUtil::CompatibleIgnoringFpPrecision(const Shape& lhs,
                                                            const Shape& rhs) {
-  if (IsArray(lhs)) {
-    return IsArray(rhs) && SameElementTypeIgnoringFpPrecision(lhs, rhs) &&
+  if (lhs.IsArray()) {
+    return rhs.IsArray() && SameElementTypeIgnoringFpPrecision(lhs, rhs) &&
            CompatibleIgnoringElementType(lhs, rhs);
   } else if (lhs.element_type() == TUPLE) {
     return rhs.element_type() == TUPLE &&
@@ -615,7 +602,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 /* static */ int64 ShapeUtil::GetDimensionNumber(const Shape& shape,
                                                  int64 dimension_number) {
   if (dimension_number < 0) {
-    dimension_number += Rank(shape);
+    dimension_number += shape.rank();
   }
   CHECK_GE(dimension_number, 0);
   return dimension_number;
@@ -669,7 +656,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
   TF_DCHECK_OK(ValidateShape(shape));
   if (shape.element_type() == TUPLE) {
     return ByteSizeOfTupleIndexTable(shape, pointer_size);
-  } else if (IsArray(shape)) {
+  } else if (shape.IsArray()) {
     int64 byte_size = ByteSizeOfElements(shape);
     if (LayoutUtil::IsSparseArray(shape)) {
       byte_size += ByteSizeOfSparseIndices(shape);
@@ -755,10 +742,10 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
     return Status::OK();
   }
 
-  if (LayoutUtil::IsSparseArray(shape) && Rank(shape) == 0) {
+  if (LayoutUtil::IsSparseArray(shape) && shape.rank() == 0) {
     return InvalidArgument("sparse arrays must have rank > 0");
   }
-  for (int64 i = 0; i < Rank(shape); ++i) {
+  for (int64 i = 0; i < shape.rank(); ++i) {
     int64 dimension = shape.dimensions(i);
     if (dimension < 0) {
       return InvalidArgument(
@@ -774,7 +761,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 /* static */ Status ShapeUtil::ValidateShapeSize(const Shape& shape) {
   VLOG(3) << "Validating shape size: " << ShapeUtil::HumanString(shape);
 
-  if (!IsArray(shape)) {
+  if (!shape.IsArray()) {
     return Status::OK();
   }
 
@@ -867,7 +854,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
                                           ShapeIndexView index) {
   const Shape* subshape = &shape;
   for (auto i : index) {
-    if (!IsTuple(*subshape) || i >= subshape->tuple_shapes_size() || i < 0) {
+    if (!subshape->IsTuple() || i >= subshape->tuple_shapes_size() || i < 0) {
       return false;
     }
     subshape = &subshape->tuple_shapes(i);
@@ -879,7 +866,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
                                                  ShapeIndexView index) {
   const Shape* return_shape = &shape;
   for (auto i : index) {
-    CHECK(IsTuple(*return_shape))
+    CHECK(return_shape->IsTuple())
         << "Invalid index " << index << " for shape " << shape;
     return_shape = &return_shape->tuple_shapes(i);
   }
@@ -890,7 +877,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
     const Shape& shape, ShapeIndexView index) {
   const Shape* return_shape = &shape;
   for (auto i : index) {
-    if (!IsTuple(*return_shape) || i < 0 ||
+    if (!return_shape->IsTuple() || i < 0 ||
         i >= return_shape->tuple_shapes_size()) {
       return InvalidArgument(
           "Shape index %s not a valid subshape index for tuple with shape %s",
@@ -905,7 +892,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
                                                   ShapeIndexView index) {
   Shape* return_shape = shape;
   for (auto i : index) {
-    CHECK(IsTuple(*return_shape));
+    CHECK(return_shape->IsTuple());
     return_shape = return_shape->mutable_tuple_shapes(i);
   }
   return return_shape;
@@ -913,11 +900,11 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 
 /* static */
 bool ShapeUtil::IsLeafIndex(const Shape& shape, const ShapeIndex& index) {
-  return !IsTuple(GetSubshape(shape, index));
+  return !GetSubshape(shape, index).IsTuple();
 }
 
 /* static */ int64 ShapeUtil::GetLeafCount(const Shape& shape) {
-  if (!IsTuple(shape)) {
+  if (!shape.IsTuple()) {
     return 1;
   }
   int64 count = 0;
@@ -1081,8 +1068,8 @@ Status ForEachMutableSubshapeHelper(
 /* static */ std::tuple<bool, std::vector<int64>, std::vector<int64>>
 ShapeUtil::InsertedOrDeleted1SizedDimensions(const Shape& shape_pre,
                                              const Shape& shape_post) {
-  CHECK(IsArray(shape_pre));
-  CHECK(IsArray(shape_post));
+  CHECK(shape_pre.IsArray());
+  CHECK(shape_post.IsArray());
 
   auto nil = std::make_tuple(false, std::vector<int64>(), std::vector<int64>());
 
@@ -1129,7 +1116,7 @@ ShapeUtil::InsertedOrDeleted1SizedDimensions(const Shape& shape_pre,
     auto unmodified_dim_pair =
         i < unmodified_dims.size()
             ? unmodified_dims[i]
-            : std::make_pair(Rank(shape_pre), Rank(shape_post));
+            : std::make_pair(shape_pre.rank(), shape_post.rank());
     if (!check_modified_dims(prior_unmodified_dim_pair, unmodified_dim_pair)) {
       return nil;
     }
@@ -1141,8 +1128,8 @@ ShapeUtil::InsertedOrDeleted1SizedDimensions(const Shape& shape_pre,
 /* static */ std::vector<std::pair<int64, int64>>
 ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
                                          const Shape& output_shape) {
-  CHECK(IsArray(input_shape));
-  CHECK(IsArray(output_shape));
+  CHECK(input_shape.IsArray());
+  CHECK(output_shape.IsArray());
 
   // Unmodified dimensions are merely common factors of rank 1.
   auto common_factors = CommonFactors(AsInt64Slice(input_shape.dimensions()),
@@ -1192,8 +1179,8 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
 
 /* static */ bool ShapeUtil::ReshapeIsBitcast(const Shape& input_shape,
                                               const Shape& output_shape) {
-  CHECK(IsArray(input_shape));
-  CHECK(IsArray(output_shape));
+  CHECK(input_shape.IsArray());
+  CHECK(output_shape.IsArray());
   CHECK(LayoutUtil::HasLayout(input_shape));
   CHECK(LayoutUtil::HasLayout(output_shape));
 
@@ -1321,12 +1308,12 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
     Shape output_shape_dim0_major = MakeShapeWithDescendingLayout(
         output_shape.element_type(), AsInt64Slice(output_shape.dimensions()));
 
-    for (int64 input_dim = 0; input_dim < Rank(input_shape); ++input_dim) {
+    for (int64 input_dim = 0; input_dim < input_shape.rank(); ++input_dim) {
       if (input_shape.dimensions(input_dim) <= 1) {
         continue;
       }
 
-      std::vector<int64> input_unit_index(Rank(input_shape), 0);
+      std::vector<int64> input_unit_index(input_shape.rank(), 0);
       input_unit_index[input_dim] = 1;
       int64 logical_linear_index =
           IndexUtil::MultidimensionalIndexToLinearIndex(input_shape_dim0_major,
@@ -1352,11 +1339,11 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
 
 /* static */ absl::optional<Shape> ShapeUtil::AlignLayouts(
     const Shape& input_shape, const Shape& output_shape) {
-  CHECK(IsArray(input_shape));
-  CHECK(IsArray(output_shape));
+  CHECK(input_shape.IsArray());
+  CHECK(output_shape.IsArray());
 
-  int64 input_rank = Rank(input_shape);
-  int64 output_rank = Rank(output_shape);
+  int64 input_rank = input_shape.rank();
+  int64 output_rank = output_shape.rank();
 
   // First, calculate an alignment of the dimensions. A consecutive sequence of
   // input dimensions and output dimensions belong to the same alignment part if
@@ -1493,14 +1480,14 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
 
 /* static */ Shape ShapeUtil::DeleteDimension(int64 dim_to_delete,
                                               Shape shape) {
-  CHECK(IsArray(shape));
+  CHECK(shape.IsArray());
   shape.DeleteDimension(dim_to_delete);
   return shape;
 }
 
 /* static */ Shape ShapeUtil::FilterDimensions(
     const std::function<bool(int64)>& p, Shape shape) {
-  CHECK(IsArray(shape));
+  CHECK(shape.IsArray());
   std::vector<int64> dims_to_delete;
   for (int64 i = shape.dimensions().size() - 1; i >= 0; --i) {
     if (!p(i)) {
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index c8295e85ce15722a3da7ab5585e6d499d3b5efa2..e98c6e024bec1f6db5c40d3cd3215ca44eb13698 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -185,7 +185,7 @@ class ShapeUtil {
   // may not actually be able to store this number of elements. See
   // LayoutUtil::MaxSparseElements(shape) to obtain the maximum number of
   // elements that can be stored in a sparse shape.
-  // Precondition: IsArray(shape)
+  // Precondition: shape.IsArray()
   static int64 ElementsIn(const Shape& shape);
 
   // As ElementsIn(), but recurses through tuples.
@@ -296,11 +296,6 @@ class ShapeUtil {
   // As Equal, but allow one of lhs and rhs to be F16 while the other is F32.
   static bool EqualIgnoringFpPrecision(const Shape& lhs, const Shape& rhs);
 
-  // Returns the rank (number of dimensions) of the given shape.
-  // Precondition: !IsTuple(shape)
-  ABSL_DEPRECATED("Use `Shape::rank` instead.")
-  static int64 Rank(const Shape& shape);
-
   // Returns the number of dimensions for which the dimension is not (trivially)
   // 1. e.g., f32[2x1x1] has a true rank of 1D, the other dimensions are just
   // fluff. Note that zero dimensions are included in the true rank, e.g.,
@@ -314,10 +309,10 @@ class ShapeUtil {
   // Scalar-specific
 
   static bool IsScalar(const Shape& shape) {
-    return IsArray(shape) && Rank(shape) == 0;
+    return shape.IsArray() && shape.rank() == 0;
   }
   static bool IsEffectiveScalar(const Shape& shape) {
-    return IsArray(shape) && TrueRank(shape) == 0;
+    return shape.IsArray() && TrueRank(shape) == 0;
   }
 
   // Returns whether "shape" is a scalar (array) with the given element_type.
@@ -457,31 +452,6 @@ class ShapeUtil {
   // that floating point numbers are signed.
   static bool ElementIsSigned(const Shape& shape);
 
-  // Returns whether the shape is a tuple.
-  ABSL_DEPRECATED("Use Shape::IsTuple instead.")
-  static bool IsTuple(const Shape& shape) {
-    return shape.element_type() == TUPLE;
-  }
-
-  // Returns whether the shape is an opaque value (i.e. an 'existential' typed
-  // value that is passed to CustomCall operations).
-  ABSL_DEPRECATED("Use Shape::IsOpaque instead.")
-  static bool IsOpaque(const Shape& shape) {
-    return shape.element_type() == OPAQUE;
-  }
-
-  // Returns whether the shape is an token value used for ordering
-  // side-effecting operations.
-  ABSL_DEPRECATED("Use Shape::IsToken instead.")
-  static bool IsToken(const Shape& shape) {
-    return shape.element_type() == TOKEN;
-  }
-
-  // Returns whether the shape is an array.  Note that scalars are considered
-  // arrays.
-  ABSL_DEPRECATED("Use Shape::IsArray instead.")
-  static bool IsArray(const Shape& shape);
-
   // Returns whether the given primitive type corresponds to an array shape.
   static bool IsArrayPrimitiveType(PrimitiveType primitive_type);
 
@@ -511,12 +481,6 @@ class ShapeUtil {
   // shape.
   static Shape ComplexComponentShape(const Shape& complex_shape);
 
-  // Shorthand for testing whether a shape is of a given element type and
-  // sequence of dimensions.
-  ABSL_DEPRECATED("Use Equal() instead.")
-  static bool ShapeIs(const Shape& shape, PrimitiveType element_type,
-                      std::initializer_list<int64> dimensions);
-
   // Returns true if the given shape has a subshape at the given index.
   static bool IndexIsValid(const Shape& shape, ShapeIndexView index);
 
@@ -764,7 +728,7 @@ class ShapeUtil {
     if (ShapeUtil::IsZeroElementArray(shape)) {
       return Status::OK();
     }
-    CHECK_EQ(Rank(shape), base.size());
+    CHECK_EQ(shape.rank(), base.size());
     CHECK_EQ(incr.size(), base.size());
     CHECK_EQ(count.size(), base.size());
     const int64 rank = LayoutUtil::MinorToMajor(shape).size();
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index 8e7c20819335bba161185bcf1237f709cfdb2a5d..61b4e73e060c18a3d0108e68d1117607d6c11c0f 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -538,10 +538,6 @@ TEST(ShapeUtilTest, InsertedOrDeleted1SizedDimensions) {
       ShapeUtil::InsertedOrDeleted1SizedDimensions(shape0, shape2)));
 }
 
-TEST(ShapeUtilTest, ShapeIs) {
-  EXPECT_FALSE(ShapeUtil::ShapeIs(ShapeUtil::MakeShape(PRED, {2}), PRED, {}));
-}
-
 TEST(ShapeUtilTest, ForEachIndex) {
   struct ShapeDimensionAndNumberInvocations {
     std::vector<int64> dimensions;
diff --git a/tensorflow/compiler/xla/sparse_index_array.h b/tensorflow/compiler/xla/sparse_index_array.h
index a96d483462efd77ae4761541e8c79b2c84fa49f3..0c25355467da3fd346d80db790d78252869975ef 100644
--- a/tensorflow/compiler/xla/sparse_index_array.h
+++ b/tensorflow/compiler/xla/sparse_index_array.h
@@ -135,7 +135,7 @@ void SparseIndexArray::SortWithValues(absl::Span<NativeT> values) {
   auto sort_order_less = [this](int64 lhs, int64 rhs) {
     return IndexUtil::CompareIndices(At(lhs), At(rhs)) < 0;
   };
-  std::sort(sort_order.begin(), sort_order.end(), sort_order_less);
+  absl::c_sort(sort_order, sort_order_less);
 
   // Reorder the array elements according to sort_order.  Work through the array
   // and follow cycles so we can do the reorder in-place.
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index ee24d4d99cb1f7ce51a72c6258cbadd6adf12f81..0fd0fc108a6c5432cf8f3a74006949d5a46b2b99 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -71,6 +71,7 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_casting_utils",
         "//tensorflow/compiler/xla/service:hlo_dataflow_analysis",
         "//tensorflow/compiler/xla/service:hlo_verifier",
         "//tensorflow/compiler/xla/service:transfer_manager",
@@ -276,9 +277,6 @@ cc_library(
 xla_test(
     name = "bad_rng_shape_validation_test",
     srcs = ["bad_rng_shape_validation_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
@@ -344,9 +342,6 @@ xla_test(
 xla_test(
     name = "check_execution_arity_test",
     srcs = ["check_execution_arity_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -367,9 +362,6 @@ xla_test(
 xla_test(
     name = "query_inferred_shape_test",
     srcs = ["query_inferred_shape_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
@@ -387,9 +379,6 @@ xla_test(
 xla_test(
     name = "while_test",
     srcs = ["while_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -413,6 +402,10 @@ xla_test(
 xla_test(
     name = "xla_hlo_profile_test",
     srcs = ["xla_hlo_profile_test.cc"],
+    blacklisted_backends = [
+        # Hlo profiles are not supported on the interpreter backend.
+        "interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:shape_util",
@@ -436,9 +429,6 @@ xla_test(
 xla_test(
     name = "axpy_simple_test",
     srcs = ["axpy_simple_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
@@ -453,7 +443,6 @@ xla_test(
 xla_test(
     name = "map_test",
     srcs = ["map_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
@@ -506,9 +495,6 @@ xla_test(
 xla_test(
     name = "pred_test",
     srcs = ["pred_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla/client:local_client",
@@ -524,9 +510,6 @@ xla_test(
 xla_test(
     name = "select_test",
     srcs = ["select_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
@@ -544,7 +527,6 @@ xla_test(
 xla_test(
     name = "conditional_test",
     srcs = ["conditional_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
@@ -562,7 +544,6 @@ xla_test(
 xla_test(
     name = "unary_op_test",
     srcs = ["unary_op_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
@@ -623,9 +604,6 @@ xla_test(
 xla_test(
     name = "deconstruct_tuple_test",
     srcs = ["deconstruct_tuple_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -648,7 +626,6 @@ xla_test(
     name = "array_elementwise_ops_test",
     srcs = ["array_elementwise_ops_test.cc"],
     shard_count = 25,
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -698,7 +675,6 @@ xla_test(
 xla_test(
     name = "reduce_precision_test",
     srcs = ["reduce_precision_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
@@ -725,7 +701,6 @@ xla_test(
     srcs = ["dot_operation_test.cc"],
     shard_count = 20,
     tags = [
-        "enable_for_xla_interpreter",
         "optonly",
     ],
     deps = [
@@ -736,6 +711,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -793,6 +769,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -806,9 +783,6 @@ xla_test(
 xla_test(
     name = "transpose_test",
     srcs = ["transpose_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:reference_util",
@@ -828,9 +802,6 @@ xla_test(
 xla_test(
     name = "constants_test",
     srcs = ["constants_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -951,6 +922,11 @@ xla_test(
 xla_test(
     name = "batch_normalization_test",
     srcs = ["batch_normalization_test.cc"],
+    blacklisted_backends = [
+        # BatchNorm HLOs are not handled by the interpreter backend, and the
+        # BatchNorm expander is not run on the interpreter.
+        "interpreter",
+    ],
     shard_count = 40,
     deps = [
         ":test_utils",
@@ -1042,9 +1018,6 @@ xla_test(
     name = "slice_test",
     srcs = ["slice_test.cc"],
     shard_count = 40,
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:reference_util",
@@ -1065,9 +1038,6 @@ xla_test(
 xla_test(
     name = "multidimensional_slice_test",
     srcs = ["multidimensional_slice_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -1085,9 +1055,6 @@ xla_test(
     name = "dynamic_ops_test",
     timeout = "moderate",
     srcs = ["dynamic_ops_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:reference_util",
@@ -1113,9 +1080,6 @@ xla_test(
 xla_test(
     name = "tuple_test",
     srcs = ["tuple_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
@@ -1139,9 +1103,6 @@ xla_test(
 xla_test(
     name = "vector_ops_reduce_test",
     srcs = ["vector_ops_reduce_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -1162,7 +1123,6 @@ xla_test(
     srcs = ["reduce_test.cc"],
     shard_count = 40,
     tags = [
-        "enable_for_xla_interpreter",
         "optonly",
     ],
     deps = [
@@ -1229,7 +1189,6 @@ xla_test(
     srcs = [],
     shard_count = 20,
     tags = [
-        "enable_for_xla_interpreter",
         "optonly",
     ],
     xla_test_library_deps = [":reduce_window_test_library"],
@@ -1241,7 +1200,6 @@ xla_test(
     timeout = "long",
     srcs = ["select_and_scatter_test.cc"],
     tags = [
-        "enable_for_xla_interpreter",
         "optonly",
     ],
     deps = [
@@ -1267,9 +1225,6 @@ xla_test(
 xla_test(
     name = "copy_test",
     srcs = ["copy_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         ":client_library_test_base",
         "//tensorflow/compiler/xla:array2d",
@@ -1290,9 +1245,6 @@ xla_test(
 xla_test(
     name = "reduce_hlo_test",
     srcs = ["reduce_hlo_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -1306,9 +1258,6 @@ xla_test(
 xla_test(
     name = "token_hlo_test",
     srcs = ["token_hlo_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service:hlo_verifier",
@@ -1323,9 +1272,6 @@ xla_test(
 xla_test(
     name = "call_test",
     srcs = ["call_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
@@ -1368,9 +1314,6 @@ xla_test(
 xla_test(
     name = "binop_scaling_test",
     srcs = ["binop_scaling_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
@@ -1388,9 +1331,6 @@ xla_test(
 xla_test(
     name = "broadcast_simple_test",
     srcs = ["broadcast_simple_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
@@ -1410,9 +1350,6 @@ xla_test(
 xla_test(
     name = "pad_test",
     srcs = ["pad_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
@@ -1434,9 +1371,6 @@ xla_test(
 xla_test(
     name = "fmax_test",
     srcs = ["fmax_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
@@ -1451,9 +1385,6 @@ xla_test(
 xla_test(
     name = "log_test",
     srcs = ["log_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
@@ -1468,9 +1399,6 @@ xla_test(
 xla_test(
     name = "matrix_ops_simple_test",
     srcs = ["matrix_ops_simple_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
@@ -1517,9 +1445,6 @@ xla_test(
     name = "reshape_test",
     srcs = ["reshape_test.cc"],
     shard_count = 30,
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
@@ -1545,9 +1470,6 @@ xla_test(
 xla_test(
     name = "reverse_test",
     srcs = ["reverse_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
@@ -1566,9 +1488,6 @@ xla_test(
 xla_test(
     name = "vector_ops_simple_test",
     srcs = ["vector_ops_simple_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla:shape_util",
@@ -1592,9 +1511,6 @@ xla_test(
 xla_test(
     name = "concat_test",
     srcs = ["concat_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -1615,9 +1531,6 @@ xla_test(
 xla_test(
     name = "convert_test",
     srcs = ["convert_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -1637,6 +1550,10 @@ xla_test(
 xla_test(
     name = "all_reduce_test",
     srcs = ["all_reduce_test.cc"],
+    blacklisted_backends = [
+        # All reduce is not supported on the interpreter backend.
+        "interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -1661,9 +1578,6 @@ xla_test(
 xla_test(
     name = "bitcast_convert_test",
     srcs = ["bitcast_convert_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -1703,9 +1617,6 @@ xla_test(
 xla_test(
     name = "floor_ceil_test",
     srcs = ["floor_ceil_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
@@ -1767,6 +1678,10 @@ xla_test(
 xla_test(
     name = "execution_profile_test",
     srcs = ["execution_profile_test.cc"],
+    blacklisted_backends = [
+        # Execution profiles are not supported on the interpreter backend.
+        "interpreter",
+    ],
     deps = [
         ":client_library_test_base",
         "//tensorflow/compiler/xla/client:global_data",
@@ -1781,6 +1696,10 @@ xla_test(
     name = "execution_profile_test_with_xla_hlo_profile",
     srcs = ["execution_profile_test.cc"],
     args = ["--xla_hlo_profile"],
+    blacklisted_backends = [
+        # Hlo profiles are not supported on the interpreter backend.
+        "interpreter",
+    ],
     deps = [
         ":client_library_test_base",
         "//tensorflow/compiler/xla/client:global_data",
@@ -1794,9 +1713,6 @@ xla_test(
 xla_test(
     name = "replay_test",
     srcs = ["replay_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:protobuf_util",
@@ -1819,9 +1735,6 @@ xla_test(
 xla_test(
     name = "broadcast_test",
     srcs = ["broadcast_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -1883,9 +1796,6 @@ xla_test(
 xla_test(
     name = "fusion_test",
     srcs = ["fusion_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
@@ -2003,6 +1913,10 @@ xla_test(
 xla_test(
     name = "outfeed_in_nested_computation_test",
     srcs = ["outfeed_in_nested_computation_test.cc"],
+    blacklisted_backends = [
+        # Outfeed ops are not supported on the interpreter backend.
+        "interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla/tests:local_client_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -2179,7 +2093,6 @@ xla_test(
     srcs = ["iota_test.cc"],
     shard_count = 30,
     tags = [
-        "enable_for_xla_interpreter",
         # Require optimized builds, iota_test_cpu is very slow in fastbuild.
         "optonly",
     ],
@@ -2207,3 +2120,18 @@ tf_cc_test(
         "@com_google_absl//absl/synchronization",
     ],
 )
+
+xla_test(
+    name = "ptxas_bug_120501638",
+    srcs = ["ptxas_bug_120501638.cc"],
+    tags = [
+        # Disabled in OSS until nvidia publicly releases a fixed ptxas.
+        "no_oss",
+    ],
+    deps = [
+        ":hlo_test_base",
+        ":xla_internal_test_main",  # fixdeps: keep
+        "//tensorflow/compiler/xla:debug_options_flags",
+        "//tensorflow/compiler/xla:test",
+    ],
+)
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 915b456b52215f8d6a9eb6c5b933f3502f1d3d2c..e4cc8b41991927dab815fac2153e525205aad4c8 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -2047,6 +2047,19 @@ XLA_TEST_F(ArrayElementwiseOpTest, NonNanClampF32) {
                              error_spec_);
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, ClampF32) {
+  SetFastMathDisabled(true);
+  XlaBuilder builder(TestName());
+  auto minimum = ConstantR1<float>(&builder, {1.0f, -6.5f, 1.0f, 2.25f, NAN});
+  auto argument =
+      ConstantR1<float>(&builder, {2.0f, 10.0f, -5.0f, 1.0f, 10.0f});
+  auto maximum = ConstantR1<float>(&builder, {3.0f, 0.5f, 25.5f, NAN, 123.0f});
+  Clamp(minimum, argument, maximum);
+
+  ComputeAndCompareR1<float>(&builder, {2.0f, 0.5f, 1.0f, NAN, NAN}, {},
+                             error_spec_);
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, ClampF32Scalar) {
   XlaBuilder builder(TestName());
   auto minimum = ConstantR0<float>(&builder, 0.0f);
diff --git a/tensorflow/compiler/xla/tests/bfloat16_test.cc b/tensorflow/compiler/xla/tests/bfloat16_test.cc
index e9728e636f0ee032416b2da17a3ea83c5bb18083..63e48117056dec4af603cbc85e478fcb15ad0cec 100644
--- a/tensorflow/compiler/xla/tests/bfloat16_test.cc
+++ b/tensorflow/compiler/xla/tests/bfloat16_test.cc
@@ -76,7 +76,9 @@ XLA_TEST_F(Bfloat16Test, NegateScalarF16) {
                                 error_spec_);
 }
 
-XLA_TEST_F(Bfloat16Test, BatchNormTraining) {
+// Disabled on interpreter since BatchNormExanper is not run by default on the
+// intepreter backend.
+XLA_TEST_F(Bfloat16Test, DISABLED_ON_INTERPRETER(BatchNormTraining)) {
   const int kFeatureIndex = 2;
   XlaBuilder builder(TestName());
 
@@ -110,7 +112,9 @@ XLA_TEST_F(Bfloat16Test, BatchNormTraining) {
   ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0.01, 0.02));
 }
 
-XLA_TEST_F(Bfloat16Test, BatchNormGrad) {
+// Disabled on interpreter since BatchNormExanper is not run by default on the
+// intepreter backend.
+XLA_TEST_F(Bfloat16Test, DISABLED_ON_INTERPRETER(BatchNormGrad)) {
   const int kFeatureIndex = 2;
   XlaBuilder builder(TestName());
 
diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc
index d5b3a4d14f932d76b2e7b198ee233756c94a2694..247328b730f3af936d933f824da491b593b27c90 100644
--- a/tensorflow/compiler/xla/tests/client_test.cc
+++ b/tensorflow/compiler/xla/tests/client_test.cc
@@ -109,7 +109,10 @@ XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) {
                                      /*minor_to_major=*/{1, 0})));
 }
 
-XLA_TEST_F(ClientTest, DISABLED_ON_GPU(ExecuteParallel)) {
+// Disabled for interpreter since ExecuteAsyncOnStream is not implemented on
+// interpreter backend.
+XLA_TEST_F(ClientTest,
+           DISABLED_ON_INTERPRETER(DISABLED_ON_GPU(ExecuteParallel))) {
   XlaComputation add_with_one_arg, mul_with_two_args, dot_with_one_arg;
   Shape shape = ShapeUtil::MakeShape(S32, {2, 2});
 
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index 249693891290e14645ee5b4b4d97b2d506a01302..9db9f2563b636c4f929585eb13a9c7f809833eda 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -467,8 +467,8 @@ XLA_TEST_F(ConvolutionTest, Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid) {
 // servers. The error message is missing the operator ++.
 template <typename T>
 void iota_int_init_value(std::vector<T>& values, int init_value) {
-  std::for_each(values.begin(), values.end(),
-                [&](T& value) { value = static_cast<T>(init_value++); });
+  absl::c_for_each(values,
+                   [&](T& value) { value = static_cast<T>(init_value++); });
 }
 
 template <typename T>
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index c5d8b663f4abe77e05ec213d2e4e075c260a8655..2e02968ac5f05c60ed9488d848f6c7fc387b41b4 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
@@ -1147,5 +1148,38 @@ XLA_TEST_F(DotOperationTest, DotRank2AndRank2NonDefaultContractionDims) {
 
   ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
 }
+
+class DotOperationTextTest : public HloTestBase {};
+
+XLA_TEST_F(DotOperationTextTest, DotReorderedDotDims) {
+  absl::string_view hlo_string =
+      R"(
+HloModule ComplexDotMultipleNonContracting
+
+ENTRY %test {
+  %lhs = f32[7,17,10,13]{3,2,1,0} parameter(0)
+  %rhs = f32[7,9,10,13,6]{4,3,2,1,0} parameter(1)
+  ROOT %dot = f32[10,7,17,9,6]{4,3,2,1,0} dot(%lhs, %rhs), lhs_batch_dims={2,0}, rhs_batch_dims={2,0}, lhs_contracting_dims={3}, rhs_contracting_dims={3}
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{1e-3, 1e-3}));
+}
+
+XLA_TEST_F(DotOperationTextTest, DotReorderedDotDimsAndMultipleContracting) {
+  absl::string_view hlo_string =
+      R"(
+HloModule ComplexDotMultipleNonContracting
+
+ENTRY %test {
+  %lhs = f32[7,5,17,10,13]{4,3,2,1,0} parameter(0)
+  %rhs = f32[7,9,10,13,6,5]{5,4,3,2,1,0} parameter(1)
+  ROOT %dot = f32[10,7,17,9,6]{4,3,2,1,0} dot(%lhs, %rhs), lhs_batch_dims={3,0}, rhs_batch_dims={2,0}, lhs_contracting_dims={1,4}, rhs_contracting_dims={5,3}
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{1e-3, 1e-3}));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/filecheck.cc b/tensorflow/compiler/xla/tests/filecheck.cc
index dcb469087e0064d17ce3b04fdeaf0b6136069a55..1b0bebe2d03a9a153cd0c80329ed0c49c91333a3 100644
--- a/tensorflow/compiler/xla/tests/filecheck.cc
+++ b/tensorflow/compiler/xla/tests/filecheck.cc
@@ -48,7 +48,7 @@ StatusOr<bool> RunFileCheck(const string& input, const string& pattern) {
 
   tensorflow::SubProcess file_check_process;
   file_check_process.SetProgram(file_check_path,
-                                {file_check_path, pattern_path});
+                                {file_check_path, "-v", pattern_path});
   file_check_process.SetChannelAction(tensorflow::CHAN_STDIN,
                                       tensorflow::ACTION_PIPE);
   file_check_process.SetChannelAction(tensorflow::CHAN_STDERR,
@@ -71,9 +71,7 @@ StatusOr<bool> RunFileCheck(const string& input, const string& pattern) {
       LOG(WARNING) << "NOTE: FileCheck binary does not exist!";
     }
 
-    LOG(WARNING) << "FileCheck error: " << standard_error;
-    LOG(WARNING) << "FileCheck input was:";
-    XLA_LOG_LINES(tensorflow::WARNING, input);
+    LOG(WARNING) << "FileCheck error:\n" << standard_error;
     LOG(WARNING) << "FileCheck pattern was:";
     XLA_LOG_LINES(tensorflow::WARNING, pattern);
   } else if (!standard_error.empty()) {
diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc
index daa89398a697af9149797d621c3bdca80a00aedd..d65b67a535d43553a3a94f76482ad4618f9b8aab 100644
--- a/tensorflow/compiler/xla/tests/gather_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc
@@ -600,7 +600,9 @@ ENTRY main {
 
 class GatherClientLibraryTest : public ClientLibraryTestBase {};
 
-XLA_TEST_F(GatherClientLibraryTest, DISABLED_ON_GPU(Basic)) {
+// Disabled on interpreter since ExectuteAsyncOnStream is not supported.
+XLA_TEST_F(GatherClientLibraryTest,
+           DISABLED_ON_INTERPRETER(DISABLED_ON_GPU(Basic))) {
   // We create this HLO, but using the XlaBuilder API.
   //
   // ENTRY main {
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index d57846e19bb80c5b9c87d50596da2915f9aef317..66f72ba8d20b8ef1f436da4425b2bb6518ee9a94 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -139,7 +139,8 @@ std::unique_ptr<VerifiedHloModule> HloTestBase::CreateNewVerifiedModule(
     const string& name) {
   return absl::make_unique<VerifiedHloModule>(
       name, GetModuleConfigForTest(), verifier_layout_sensitive_,
-      allow_mixed_precision_in_hlo_verifier_);
+      allow_mixed_precision_in_hlo_verifier_,
+      backend().compiler()->ShapeSizeBytesFunction());
 }
 
 StatusOr<std::unique_ptr<VerifiedHloModule>>
@@ -147,7 +148,8 @@ HloTestBase::ParseAndReturnVerifiedModule(absl::string_view hlo_text,
                                           const HloModuleConfig& config) {
   auto module = absl::make_unique<VerifiedHloModule>(
       TestName(), config, verifier_layout_sensitive_,
-      allow_mixed_precision_in_hlo_verifier_);
+      allow_mixed_precision_in_hlo_verifier_,
+      backend().compiler()->ShapeSizeBytesFunction());
   TF_RETURN_IF_ERROR(ParseHloString(hlo_text, module.get()));
   TF_RETURN_IF_ERROR(module->Verify());
   return std::move(module);
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 1d1e7f437296a7493ef7da07039fcf6d273f35bc..69a4f96288c7285010e9adbdc33f1b394f58d8d2 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -46,10 +46,12 @@ class VerifiedHloModule : public HloModule {
  public:
   VerifiedHloModule(const string& name, const HloModuleConfig& config,
                     bool verifier_layout_sensitive,
-                    bool allow_mixed_precision_in_hlo_verifier)
+                    bool allow_mixed_precision_in_hlo_verifier,
+                    std::function<int64(const Shape&)> shape_size_function)
       : HloModule(name, config),
-        verifier_(verifier_layout_sensitive,
-                  allow_mixed_precision_in_hlo_verifier) {}
+        verifier_(
+            verifier_layout_sensitive, allow_mixed_precision_in_hlo_verifier,
+            /*instruction_can_change_layout_func=*/{}, shape_size_function) {}
 
   ~VerifiedHloModule() override { VerifyOrAddFailure("in destructor"); }
 
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index 6522c563252a0e8271bf733668f39bcf00a80d06..96527886b718bc1ea4ce8cc2d7dbeb2e3ef1d1eb 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -842,7 +842,8 @@ XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion64bit) {
        LiteralUtil::CreateR0<int64>(123456789000LL)}));
 }
 
-XLA_TEST_F(LocalClientExecuteTest, InfeedTest) {
+// Disabled on interpreter backend since infeed HLO is unsupported.
+XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_INTERPRETER(InfeedTest)) {
   XlaBuilder builder(TestName());
   const Shape shape = ShapeUtil::MakeShape(F32, {3});
   auto in = Infeed(&builder, shape);
@@ -867,7 +868,8 @@ XLA_TEST_F(LocalClientExecuteTest, InfeedTest) {
   LiteralTestUtil::ExpectR1Equal<float>({-4.0, 125.0, 45.0}, result);
 }
 
-XLA_TEST_F(LocalClientExecuteTest, InfeedOutfeedTest) {
+// Disabled on interpreter backend since infeed/outfeed HLOs are unsupported.
+XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_INTERPRETER(InfeedOutfeedTest)) {
   XlaBuilder builder(TestName());
   const Shape shape = ShapeUtil::MakeShape(F32, {3});
   auto in = Infeed(&builder, shape);
diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc
index 8f2c26f0eea9c7a3b33cd77e5977924c1659535a..e49bcf26bd6e50f8fb36c86f217907b5d4901eae 100644
--- a/tensorflow/compiler/xla/tests/prng_test.cc
+++ b/tensorflow/compiler/xla/tests/prng_test.cc
@@ -80,7 +80,9 @@ XLA_TEST_F(PrngTest, LargeU01) { UniformTest<float>(0, 1, {0x100, 0x100}); }
 XLA_TEST_F(PrngTest, TwelveValuesU524) { UniformTest<int32>(5, 24, {12}); }
 
 // TODO(b/71543667): Fix Rng ops on LLVM backends.
-XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16Tests))) {
+// TODO(b/122047800): Interpreter does not support BF16 for RNG ops.
+XLA_TEST_F(PrngTest, DISABLED_ON_INTERPRETER(
+                         DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16Tests)))) {
   for (int64 seed = 0; seed < 100; ++seed) {
     // The largest negative number smaller than zero in bf16 that's not
     // denormalized.
@@ -103,7 +105,9 @@ XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16Tests))) {
 }
 
 // TODO(b/71543667): Fix Rng ops on LLVM backends.
-XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16CountTests))) {
+// TODO(b/122047800): Interpreter does not support BF16 for RNG ops.
+XLA_TEST_F(PrngTest, DISABLED_ON_INTERPRETER(DISABLED_ON_GPU(
+                         DISABLED_ON_CPU(ScalarBF16CountTests)))) {
   // There are 3 BF16 values in the range of [32.25, 33): 32.25, 32.5, 32.75,
   // they should get similar counts.
   bfloat16 low = static_cast<bfloat16>(32.25);
@@ -276,6 +280,39 @@ XLA_TEST_F(PrngTest, PassInGlobalRngSeed) {
   EXPECT_FALSE(LiteralTestUtil::Equal(result5, result6));
 }
 
+// This test verifies that the two RNG instructions with the same parameters in
+// the same HloComputation produces different values.
+XLA_TEST_F(PrngTest, DifferentValuesForIdenticalRngNodesInSameComputation) {
+  // Build a U[0,1) computation.
+  auto build_computation = [this]() {
+    XlaBuilder builder(TestName());
+    auto a = RngUniform(ConstantR0<int32>(&builder, 0),
+                        ConstantR0<int32>(&builder, 100),
+                        ShapeUtil::MakeShape(S32, {10}));
+    auto b = RngUniform(ConstantR0<int32>(&builder, 0),
+                        ConstantR0<int32>(&builder, 100),
+                        ShapeUtil::MakeShape(S32, {10}));
+    Tuple(&builder, {a, b});
+    return builder.Build();
+  };
+
+  ExecutionOptions execution_options = execution_options_;
+  execution_options.set_seed(42);
+
+  Literal result_tuple;
+  {
+    TF_ASSERT_OK_AND_ASSIGN(auto computation, build_computation());
+    TF_ASSERT_OK_AND_ASSIGN(
+        result_tuple, client_->ExecuteAndTransfer(computation, /*arguments=*/{},
+                                                  &execution_options));
+  }
+
+  auto results = result_tuple.DecomposeTuple();
+  ASSERT_EQ(results.size(), 2);
+
+  EXPECT_FALSE(LiteralTestUtil::Equal(results[0], results[1]));
+}
+
 XLA_TEST_F(PrngTest, TenValuesN01) {
   XlaBuilder builder(TestName());
   RngNormal(ConstantR0<float>(&builder, 0), ConstantR0<float>(&builder, 1),
diff --git a/tensorflow/compiler/xla/tests/ptxas_bug_120501638.cc b/tensorflow/compiler/xla/tests/ptxas_bug_120501638.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0e5d7db97e88936e7336ed02a5c7a1171254b0cf
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/ptxas_bug_120501638.cc
@@ -0,0 +1,82 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/debug_options_flags.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+
+namespace xla {
+namespace {
+
+class PtxasBugTest : public HloTestBase {};
+
+// Checks for a bug in ptxas, tracked as Google bug 120501638, and nvidia bug
+// 2459377.  We never received an explanation of what exactly was going wrong
+// here in ptxas.  Known-bad in ptxas 10.0.145, known-good in ptxas 10.0.249.
+TEST_F(PtxasBugTest, DoIt) {
+  const char* const kModuleStr = R"(
+HloModule test
+
+add_F32.14 {
+  lhs.15 = f32[] parameter(0)
+  rhs.16 = f32[] parameter(1)
+  ROOT add.17 = f32[] add(lhs.15, rhs.16)
+}
+
+ENTRY testcase {
+  arg0.1 = f32[2,5,2]{2,1,0} parameter(0)
+  reshape.2 = f32[2,5,2]{2,1,0} reshape(arg0.1)
+  constant.3 = f32[] constant(0)
+  pad.4 = f32[2,6,2]{2,1,0} pad(reshape.2, constant.3), padding=0_0x0_1x0_0
+  reshape.5 = f32[2,3,2,2]{3,2,1,0} reshape(pad.4)
+  transpose.6 = f32[2,2,3,2]{3,0,2,1} transpose(reshape.5), dimensions={2,0,1,3}
+  reshape.7 = f32[4,3,2]{2,1,0} reshape(transpose.6)
+  reshape.8 = f32[4,1,3,2]{3,2,1,0} reshape(reshape.7)
+  transpose.9 = f32[4,2,1,3]{1,3,2,0} transpose(reshape.8), dimensions={0,3,1,2}
+  convert.10 = f32[4,2,1,3]{1,3,2,0} convert(transpose.9)
+  constant.12 = f32[] constant(0)
+  pad.13 = f32[4,2,1,3]{3,2,1,0} pad(convert.10, constant.12), padding=0_0x0_0x0_0x0_0
+  constant.11 = f32[] constant(0)
+  reduce-window.18 = f32[4,2,1,3]{3,2,1,0} reduce-window(pad.13, constant.11),
+    window={size=1x1x1x1}, to_apply=add_F32.14
+  constant.19 = f32[] constant(1)
+  broadcast.20 = f32[4,2,1,3]{3,2,1,0} broadcast(constant.19), dimensions={}
+  divide.21 = f32[4,2,1,3]{3,2,1,0} divide(reduce-window.18, broadcast.20)
+  convert.22 = f32[4,2,1,3]{3,2,1,0} convert(divide.21)
+  transpose.23 = f32[4,1,3,2]{2,1,3,0} transpose(convert.22), dimensions={0,2,3,1}
+  reshape.24 = f32[4,3,2]{2,1,0} reshape(transpose.23)
+  reshape.25 = f32[2,2,3,2]{3,2,1,0} reshape(reshape.24)
+  transpose.26 = f32[2,3,2,2]{3,1,0,2} transpose(reshape.25), dimensions={1,2,0,3}
+  reshape.27 = f32[2,6,2]{2,1,0} reshape(transpose.26)
+  slice.28 = f32[2,5,2]{2,1,0} slice(reshape.27), slice={[0:2], [0:5], [0:2]}
+  reshape.29 = f32[2,5,2]{2,1,0} reshape(slice.28)
+  tuple.30 = (f32[2,5,2]{2,1,0}) tuple(reshape.29)
+  ROOT get-tuple-element.31 = f32[2,5,2]{2,1,0} get-tuple-element(tuple.30), index=0
+})";
+
+  // Create a module with the true-default flags, not the default-for-testing
+  // flags.  In particular, true-default flags enable unrolling, whereas for
+  // testing we disable unrolling, and this bug doesn't trigger without
+  // unrolling.
+  HloModuleConfig config;
+  config.set_debug_options(DefaultDebugOptionsIgnoringFlags());
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr, config));
+  EXPECT_TRUE(RunAndCompare(std::move(module), ErrorSpec{0.01, 0.01}));
+}
+
+}  // anonymous namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/test_macros.h b/tensorflow/compiler/xla/tests/test_macros.h
index 7ca99a91635e85cd0888e59ecde31e47fec21844..80a6868485c9162d1cb0de24f0adf3f1c1d2503a 100644
--- a/tensorflow/compiler/xla/tests/test_macros.h
+++ b/tensorflow/compiler/xla/tests/test_macros.h
@@ -79,30 +79,28 @@ string PrependDisabledIfIndicated(const string& test_case_name,
 // heuristic to decide whether the test case should be disabled, and we
 // determine whether the test case should be disabled by resolving the (test
 // case name, test name) in a manifest file.
-#define XLA_GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)   \
-  class GTEST_TEST_CLASS_NAME_(test_case_name, test_name)                     \
-      : public parent_class {                                                 \
-   public:                                                                    \
-    GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}                    \
-                                                                              \
-   private:                                                                   \
-    virtual void TestBody();                                                  \
-    static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;     \
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_case_name,    \
-                                                           test_name));       \
-  };                                                                          \
-                                                                              \
-  ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name,           \
-                                                    test_name)::test_info_ =  \
-      ::testing::internal::MakeAndRegisterTestInfo(                           \
-          #test_case_name,                                                    \
-          ::xla::PrependDisabledIfIndicated(#test_case_name, #test_name)      \
-              .c_str(),                                                       \
-          nullptr, nullptr,                                                   \
-          ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \
-          parent_class::SetUpTestCase, parent_class::TearDownTestCase,        \
-          new ::testing::internal::TestFactoryImpl<GTEST_TEST_CLASS_NAME_(    \
-              test_case_name, test_name)>);                                   \
+#define XLA_GTEST_TEST_(test_case_name, test_name, parent_class)             \
+  class GTEST_TEST_CLASS_NAME_(test_case_name, test_name)                    \
+      : public parent_class {                                                \
+   public:                                                                   \
+    GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}                   \
+                                                                             \
+   private:                                                                  \
+    virtual void TestBody();                                                 \
+    static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;    \
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_case_name,   \
+                                                           test_name));      \
+  };                                                                         \
+                                                                             \
+  ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name,          \
+                                                    test_name)::test_info_ = \
+      ::testing::RegisterTest(                                               \
+          #test_case_name,                                                   \
+          ::xla::PrependDisabledIfIndicated(#test_case_name, #test_name)     \
+              .c_str(),                                                      \
+          nullptr, nullptr, __FILE__, __LINE__, []() -> parent_class* {      \
+            return new GTEST_TEST_CLASS_NAME_(test_case_name, test_name)();  \
+          });                                                                \
   void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
 
 // This is identical to the TEST_F macro from "gtest", but it potentially
@@ -111,9 +109,8 @@ string PrependDisabledIfIndicated(const string& test_case_name,
 // Per usual, you can see what tests are available via --gunit_list_tests and
 // choose to run tests that have been disabled via the manifest via
 // --gunit_also_run_disabled_tests.
-#define XLA_TEST_F(test_fixture, test_name)              \
-  XLA_GTEST_TEST_(test_fixture, test_name, test_fixture, \
-                  ::testing::internal::GetTypeId<test_fixture>())
+#define XLA_TEST_F(test_fixture, test_name) \
+  XLA_GTEST_TEST_(test_fixture, test_name, test_fixture)
 
 // Likewise, this is identical to the TEST_P macro from "gtest", but
 // potentially disables the test based on the DISABLED_MANIFEST file.
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 290886624de9f9fcca533d01f15d4a3e4c23d7ee..95c89b0ba6f29c453abab88e29bca13ee006455a 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
@@ -274,16 +275,9 @@ bool NeedsInitValue(const HloUse& use) {
 
 // Generate random values that are constrained to the input_shape minus the
 // output_shape so as not to produce wrapping slices, for instance.
-Literal MakeRandomIndex(absl::Span<const int64> index_space,
-                        std::minstd_rand0* engine) {
-  std::vector<int32> start_indices(index_space.size());
-  if (engine != nullptr) {
-    for (int i = 0; i < index_space.size(); ++i) {
-      std::uniform_int_distribution<int32> generator(0, index_space[i]);
-      start_indices[i] = generator(*engine);
-    }
-  }
-  return LiteralUtil::CreateR1<int32>(start_indices);
+Literal MakeRandomIndex(int64 index_bound, std::minstd_rand0* engine) {
+  std::uniform_int_distribution<int32> generator(0, index_bound);
+  return LiteralUtil::CreateR0<int32>(generator(*engine));
 }
 
 // Use dataflow analysis on each parameter to see if there are uses that would
@@ -300,8 +294,8 @@ std::vector<HloInstruction*> FindConstrainedUses(
       HloInstruction* instruction = use.instruction;
       const HloOpcode opcode = instruction->opcode();
       const int64 op_num = use.operand_number;
-      if ((opcode == HloOpcode::kDynamicSlice && op_num == 1) ||
-          (opcode == HloOpcode::kDynamicUpdateSlice && op_num == 2)) {
+      if ((opcode == HloOpcode::kDynamicSlice && op_num >= 1) ||
+          (opcode == HloOpcode::kDynamicUpdateSlice && op_num >= 2)) {
         constrained_uses.push_back(instruction);
       } else if (opcode == HloOpcode::kFusion) {
         const HloInstruction* const to_analyze =
@@ -336,7 +330,7 @@ std::vector<HloInstruction*> FindConstrainedUses(
 StatusOr<Literal> CreateLiteralForConstrainedUses(
     const absl::Span<HloInstruction* const> constrained_uses,
     const HloInstruction& param, std::minstd_rand0* engine) {
-  std::vector<int64> index_space;
+  int64 index_bound = INT64_MAX;
   bool no_duplicates = false;
   bool needs_constant = false;
   ConstantType constant_type = ConstantType::kUnknown;
@@ -348,19 +342,16 @@ StatusOr<Literal> CreateLiteralForConstrainedUses(
         const Shape& slice_shape = use->opcode() == HloOpcode::kDynamicSlice
                                        ? use->shape()
                                        : use->operand(1)->shape();
-        const int64 rank = indexed_shape.rank();
-        if (!index_space.empty()) {
-          TF_RET_CHECK(rank == index_space.size());
-          for (int64 i = 0; i < rank; ++i) {
-            index_space[i] = std::min(
-                index_space[i], ShapeUtil::GetDimension(indexed_shape, i) -
-                                    ShapeUtil::GetDimension(slice_shape, i));
-          }
-        } else {
-          index_space.resize(rank);
-          for (int64 i = 0; i < rank; ++i) {
-            index_space[i] = ShapeUtil::GetDimension(indexed_shape, i) -
-                             ShapeUtil::GetDimension(slice_shape, i);
+        const int64 first_index =
+            Cast<HloDynamicIndexInstruction>(use)->first_index_operand_number();
+        for (int64 operand = first_index; operand < use->operand_count();
+             ++operand) {
+          if (use->operand(operand) == &param) {
+            index_bound = std::min(
+                index_bound,
+                ShapeUtil::GetDimension(indexed_shape, operand - first_index) -
+                    ShapeUtil::GetDimension(slice_shape,
+                                            operand - first_index));
           }
         }
         break;
@@ -388,13 +379,14 @@ StatusOr<Literal> CreateLiteralForConstrainedUses(
   }
   int constraint_count = 0;
   constraint_count += no_duplicates ? 1 : 0;
-  constraint_count += !index_space.empty() ? 1 : 0;
+  constraint_count += (index_bound != INT64_MAX) ? 1 : 0;
   constraint_count += needs_constant ? 1 : 0;
   if (constraint_count > 1) {
     return Unimplemented("Conflicting operand generation constraints.");
   }
-  if (!index_space.empty()) {
-    return MakeRandomIndex(index_space, engine);
+  if (index_bound != INT64_MAX) {
+    return MakeRandomIndex(index_bound, engine)
+        .Reshape(param.shape().dimensions());
   } else if (needs_constant) {
     switch (constant_type) {
       case ConstantType::kZero:
diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc
index 448a66cfdd897b17cce1c87c050520a2f2eb0ea2..591d6c19228a313f530cdae18f4be37e7b517601 100644
--- a/tensorflow/compiler/xla/tests/test_utils_test.cc
+++ b/tensorflow/compiler/xla/tests/test_utils_test.cc
@@ -79,25 +79,26 @@ XLA_TEST_F(TestUtilsTest, MultipleIndexSpacesForDynamicSlices) {
                     R"(HloModule index_space_module
 
     ENTRY IndexSpace {
-      index_param = s32[3]{0} parameter(0)
-      array_param.1 = f32[123,4,789]{0,1,2} parameter(1)
-      array_param.2 = f32[3,3000,5]{0,1,2} parameter(2)
-      dynamic-slice.1 = f32[1,2,3] dynamic-slice(array_param.1, index_param), dynamic_slice_sizes={1,2,3}
-      ROOT dynamic-slice.2 = f32[3,2,2] dynamic-slice(array_param.2, index_param), dynamic_slice_sizes={3,2,2}
+      index_param.0 = s32[] parameter(0)
+      index_param.1 = s32[] parameter(1)
+      index_param.2 = s32[] parameter(2)
+      array_param.1 = f32[123,4,789]{0,1,2} parameter(3)
+      array_param.2 = f32[3,3000,5]{0,1,2} parameter(4)
+      dynamic-slice.1 = f32[1,2,3] dynamic-slice(array_param.1, index_param.0, index_param.1, index_param.2), dynamic_slice_sizes={1,2,3}
+      ROOT dynamic-slice.2 = f32[3,2,2] dynamic-slice(array_param.2, index_param.0, index_param.1, index_param.2), dynamic_slice_sizes={3,2,2}
     })")
                     .ValueOrDie();
   TF_ASSERT_OK_AND_ASSIGN(std::vector<Literal> args,
                           MakeFakeArguments(module.get()));
-  ASSERT_EQ(args.size(), 3);
-  const Literal& index_arg = args[0];
+  ASSERT_EQ(args.size(), 5);
 
-  EXPECT_EQ(index_arg.Get<int32>({0}), 0);
+  EXPECT_EQ(args[0].Get<int32>({}), 0);
 
-  EXPECT_GE(index_arg.Get<int32>({1}), 0);
-  EXPECT_LE(index_arg.Get<int32>({1}), 2);
+  EXPECT_GE(args[1].Get<int32>({}), 0);
+  EXPECT_LE(args[0].Get<int32>({}), 2);
 
-  EXPECT_GE(index_arg.Get<int32>({2}), 0);
-  EXPECT_LE(index_arg.Get<int32>({2}), 3);
+  EXPECT_GE(args[2].Get<int32>({}), 0);
+  EXPECT_LE(args[2].Get<int32>({}), 3);
 }
 
 XLA_TEST_F(TestUtilsTest, MultipleIndexSpacesForDynamicUpdateSlices) {
@@ -105,28 +106,29 @@ XLA_TEST_F(TestUtilsTest, MultipleIndexSpacesForDynamicUpdateSlices) {
                     R"(HloModule index_space_module
 
     ENTRY IndexSpace {
-      index_param = s32[3]{0} parameter(0)
-      array_param.1 = f32[123,4,789]{0,1,2} parameter(1)
-      array_param.2 = f32[3,3000,5]{0,1,2} parameter(2)
-      update_param.1 = f32[1,2,3]{0,1,2} parameter(3)
-      update_param.2 = f32[3,2,2]{0,1,2} parameter(4)
-
-      dynamic-update-slice.1 = f32[123,4,789] dynamic-update-slice(array_param.1, update_param.1, index_param)
-      ROOT dynamic-update-slice.2 = f32[3,3000,5] dynamic-update-slice(array_param.2, update_param.2, index_param)
+      index_param.0 = s32[] parameter(0)
+      index_param.1 = s32[] parameter(1)
+      index_param.2 = s32[] parameter(2)
+      array_param.1 = f32[123,4,789]{0,1,2} parameter(3)
+      array_param.2 = f32[3,3000,5]{0,1,2} parameter(4)
+      update_param.1 = f32[1,2,3]{0,1,2} parameter(5)
+      update_param.2 = f32[3,2,2]{0,1,2} parameter(6)
+
+      dynamic-update-slice.1 = f32[123,4,789] dynamic-update-slice(array_param.1, update_param.1, index_param.0, index_param.1, index_param.2)
+      ROOT dynamic-update-slice.2 = f32[3,3000,5] dynamic-update-slice(array_param.2, update_param.2, index_param.0, index_param.1, index_param.2)
     })")
                     .ValueOrDie();
   TF_ASSERT_OK_AND_ASSIGN(std::vector<Literal> args,
                           MakeFakeArguments(module.get()));
-  ASSERT_EQ(args.size(), 5);
-  const Literal& index_arg = args[0];
+  ASSERT_EQ(args.size(), 7);
 
-  EXPECT_EQ(index_arg.Get<int32>({0}), 0);
+  EXPECT_EQ(args[0].Get<int32>({}), 0);
 
-  EXPECT_GE(index_arg.Get<int32>({1}), 0);
-  EXPECT_LE(index_arg.Get<int32>({1}), 2);
+  EXPECT_GE(args[1].Get<int32>({}), 0);
+  EXPECT_LE(args[0].Get<int32>({}), 2);
 
-  EXPECT_GE(index_arg.Get<int32>({2}), 0);
-  EXPECT_LE(index_arg.Get<int32>({2}), 3);
+  EXPECT_GE(args[2].Get<int32>({}), 0);
+  EXPECT_LE(args[2].Get<int32>({}), 3);
 }
 
 XLA_TEST_F(TestUtilsTest, NoDuplicatesFloats) {
@@ -198,5 +200,33 @@ ENTRY %sort. (parameter.0: bf16[2,1452], parameter.1: s32[2,1452]) -> (bf16[2,14
   }
 }
 
+XLA_TEST_F(TestUtilsTest, MakeFakeArgumentsR0InputToDynamicSlice) {
+  auto module = ParseHloString(R"(
+HloModule Test
+
+ENTRY %module (parameter.0: s32[], parameter.1: f32[20,20]) -> f32[] {
+  %parameter.1 = f32[20,20]{1,0} parameter(1)
+  %constant.1 = s32[1]{0} constant({0})
+  %parameter.0 = s32[] parameter(0)
+  %bitcast.3 = s32[1]{0} bitcast(s32[] %parameter.0)
+  %concatenate.1 = s32[2]{0} concatenate(s32[1]{0} %constant.1, s32[1]{0} %bitcast.3), dimensions={0}
+  %dynamic-slice.2 = f32[20,1]{1,0} dynamic-slice(f32[20,20]{1,0} %parameter.1, s32[2]{0} %concatenate.1), dynamic_slice_sizes={20,1}
+  %bitcast.4 = f32[20]{0} bitcast(f32[20,1]{1,0} %dynamic-slice.2)
+  %dynamic-slice.3 = f32[1]{0} dynamic-slice(f32[20]{0} %bitcast.4, s32[1]{0} %bitcast.3), dynamic_slice_sizes={1}
+  ROOT %bitcast.5 = f32[] bitcast(f32[1]{0} %dynamic-slice.3)
+}
+)")
+                    .ValueOrDie();
+
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<Literal> args,
+                          MakeFakeArguments(module.get()));
+  ASSERT_EQ(args.size(), 2);
+  EXPECT_TRUE(ShapeUtil::Equal(args[0].shape(), ShapeUtil::MakeShape(S32, {})))
+      << ShapeUtil::HumanString(args[0].shape());
+  EXPECT_TRUE(
+      ShapeUtil::Equal(args[1].shape(), ShapeUtil::MakeShape(F32, {20, 20})))
+      << ShapeUtil::HumanString(args[1].shape());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index 9c586bdeb05afb7378e92caed1f3edc408e051bf..426d6c84ee3a9e30bdba1da7ae570ed0279a8440 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -176,8 +176,9 @@ XLA_TEST_F(TupleTest, AddTupleElements) {
       {2.f, 4.f, 6.f},  // row 0
       {5.f, 7.f, 9.f},  // row 1
   });
-  ASSERT_TRUE(ShapeUtil::ShapeIs(vector_shape, F32, {3}));
-  ASSERT_TRUE(ShapeUtil::ShapeIs(matrix_shape, F32, {/*y=*/2, /*x=*/3}));
+  ASSERT_TRUE(ShapeUtil::Equal(vector_shape, ShapeUtil::MakeShape(F32, {3})));
+  ASSERT_TRUE(ShapeUtil::Equal(matrix_shape,
+                               ShapeUtil::MakeShape(F32, {/*y=*/2, /*x=*/3})));
   ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
 }
 
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index 1538f2afbafad53fa2f7a3fd1a4705e5ab55f536..c7337e8caae8f2ee25f4b25dc22439e08d2ecc25 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -174,9 +174,8 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
   exec_run_options.set_allocator(backend->memory_allocator());
   exec_run_options.set_intra_op_thread_pool(
       backend->eigen_intra_op_thread_pool_device());
-  ServiceExecutableRunOptions run_options(
-      exec_run_options, /*borrow_stream=*/nullptr,
-      backend->eigen_intra_op_thread_pool());
+  ServiceExecutableRunOptions run_options(exec_run_options,
+                                          /*borrow_stream=*/nullptr);
   std::vector<const ShapedBuffer*> args = {&lhs_arg, &rhs_arg};
   TF_ASSERT_OK_AND_ASSIGN(
       auto execution_result,
diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc
index 68cab7387cf1576072f96878b50f07def6862d8b..34b73b5206fa20d6dff7567afd78fd89897c8c33 100644
--- a/tensorflow/compiler/xla/util.cc
+++ b/tensorflow/compiler/xla/util.cc
@@ -86,7 +86,7 @@ bool IsPermutation(absl::Span<const int64> permutation, int64 rank) {
     CHECK_LT(index, rank);
     output[index] = 0;
   }
-  return std::find(output.begin(), output.end(), -1) == output.end();
+  return !absl::c_linear_search(output, -1);
 }
 
 std::vector<int64> InversePermutation(
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index 6722641e9d2c177440361e6f0d1f6c0804eb7cda..f2fd17dc99455a921bf875aad2a3661b4d456823 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -324,8 +324,7 @@ bool IsIdentityPermutation(absl::Span<const int64> permutation);
 
 template <typename Container>
 int64 PositionInContainer(const Container& container, int64 value) {
-  return std::distance(container.begin(),
-                       std::find(container.begin(), container.end(), value));
+  return std::distance(container.begin(), absl::c_find(container, value));
 }
 
 // Formats the container as a comma-separated string. StrAppend must support
diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc
index 51c73b3d17e4c32d9a8a14d3055ab56f02922af3..e001cc35f9fcea2783b3952e825838af6bbece72 100644
--- a/tensorflow/compiler/xla/window_util.cc
+++ b/tensorflow/compiler/xla/window_util.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -137,25 +138,23 @@ bool HasPadding(const Window& window) {
 }
 
 bool HasSymmetricPadding(const Window& window) {
-  return std::all_of(window.dimensions().begin(), window.dimensions().end(),
-                     [](const WindowDimension& dim) {
-                       return dim.padding_low() == dim.padding_high();
-                     });
+  return absl::c_all_of(window.dimensions(), [](const WindowDimension& dim) {
+    return dim.padding_low() == dim.padding_high();
+  });
 }
 
 bool HasSymmetricPadding(const PaddingConfig& padding_config) {
-  return std::all_of(padding_config.dimensions().begin(),
-                     padding_config.dimensions().end(),
-                     [](const PaddingConfig::PaddingConfigDimension& dim) {
-                       return dim.edge_padding_low() == dim.edge_padding_high();
-                     });
+  return absl::c_all_of(padding_config.dimensions(),
+                        [](const PaddingConfig::PaddingConfigDimension& dim) {
+                          return dim.edge_padding_low() ==
+                                 dim.edge_padding_high();
+                        });
 }
 
 bool HasNegativePadding(const Window& window) {
-  return std::any_of(window.dimensions().begin(), window.dimensions().end(),
-                     [](const WindowDimension& dim) {
-                       return dim.padding_low() < 0 || dim.padding_high() < 0;
-                     });
+  return absl::c_any_of(window.dimensions(), [](const WindowDimension& dim) {
+    return dim.padding_low() < 0 || dim.padding_high() < 0;
+  });
 }
 
 bool HasBaseDilation(const Window& window) {
@@ -190,10 +189,9 @@ bool AllOrNoneReversed(const Window& window) {
     return true;
   }
   bool reversed = window.dimensions()[0].window_reversal();
-  return std::all_of(window.dimensions().begin(), window.dimensions().end(),
-                     [&](const WindowDimension& dim) {
-                       return dim.window_reversal() == reversed;
-                     });
+  return absl::c_all_of(window.dimensions(), [&](const WindowDimension& dim) {
+    return dim.window_reversal() == reversed;
+  });
 }
 
 bool HasDilation(const Window& window) {
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index 0e8fa73f8170addfa5061b33f3d6882a13890bce..e2d7b6ef4666c533951960fd3dcf6869ec2b52c5 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -230,7 +230,11 @@ message DebugOptions {
   // Enable fast math with eigen in the HLO evaluator.
   bool xla_hlo_evaluator_use_fast_path = 106;
 
-  // Next id: 107
+  // Temporary option to allow support for both the R1 and the scalar index
+  // versions of DynamicSlice and DynamicUpdateSlice. Only used for testing.
+  bool xla_allow_scalar_index_dynamic_ops = 107;
+
+  // Next id: 108
 
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.
diff --git a/tensorflow/compiler/xrt/kernels/BUILD b/tensorflow/compiler/xrt/kernels/BUILD
index 67f475846e5f16060c1080759b0acb4216c4e72b..78d093ec4a5db49002bd987aff1c6d5ca2a3a0c6 100644
--- a/tensorflow/compiler/xrt/kernels/BUILD
+++ b/tensorflow/compiler/xrt/kernels/BUILD
@@ -62,7 +62,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/stream_executor:stream_executor_headers_lib",
+        "//tensorflow/stream_executor:stream_executor_headers",
         "@com_google_absl//absl/strings",
     ],
     alwayslink = 1,
diff --git a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
index 2ccdf0f02d840600d5e0649c4805e3672d4a1286..2ee1a6cd1aebcdbd65892b33e5044489070ab5c4 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
@@ -215,11 +215,6 @@ XRTReleaseCompilationRefOp::~XRTReleaseCompilationRefOp() = default;
 void XRTReleaseCompilationRefOp::Compute(OpKernelContext* ctx) {
   VLOG(1) << "XRTReleaseCompilationRefOp::Compute";
 
-  const Tensor& key_tensor = ctx->input(0);
-  OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(key_tensor.shape()),
-              errors::Internal("computation key should be a string scalar"));
-  int64 uid = key_tensor.scalar<int64>()();
-
   ResourceMgr* rm;
   OP_REQUIRES_OK(ctx, XRTGenericDeviceAccessor::GetResourceManager(ctx, &rm));
 
@@ -230,9 +225,13 @@ void XRTReleaseCompilationRefOp::Compute(OpKernelContext* ctx) {
                           kXRTCompilationCacheResourceName, &cache));
   core::ScopedUnref cache_unref(cache);
 
-  OP_REQUIRES_OK(ctx, cache->Release(uid));
-
-  VLOG(2) << "Released computation handle " << uid;
+  const Tensor& keys_tensor = ctx->input(0);
+  auto flat_keys = keys_tensor.flat<int64>();
+  for (int64 i = 0; i < flat_keys.size(); ++i) {
+    int64 key = flat_keys(i);
+    OP_REQUIRES_OK(ctx, cache->Release(key));
+    VLOG(2) << "Released computation handle " << key;
+  }
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
index 2e2f3ff116a7b331df8dbd58a9fe40096f524140..c8def16bbc05bbc41ef2faf98a01c1d5d758890f 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
+++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
@@ -453,17 +453,17 @@ class XRTReleaseAllocationOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     VLOG(1) << "XRTReleaseAllocationOp::Compute";
 
-    const Tensor& allocation_handle = ctx->input(0);
-    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(allocation_handle.shape()),
-                errors::Internal("handle input should be an int64 scalar"));
-    int64 key = allocation_handle.scalar<int64>()();
-
     ResourceMgr* rm;
     OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));
 
-    OP_REQUIRES_OK(ctx, XRTTupleAllocation::DeleteFromResourceManager(rm, key));
-
-    VLOG(2) << "Released allocation handle " << key;
+    const Tensor& allocation_handle = ctx->input(0);
+    auto flat_keys = allocation_handle.flat<int64>();
+    for (int64 i = 0; i < flat_keys.size(); ++i) {
+      int64 key = flat_keys(i);
+      OP_REQUIRES_OK(ctx,
+                     XRTTupleAllocation::DeleteFromResourceManager(rm, key));
+      VLOG(2) << "Released allocation handle " << key;
+    }
   }
 };
 
diff --git a/tensorflow/compiler/xrt/ops/xrt_compile_ops.cc b/tensorflow/compiler/xrt/ops/xrt_compile_ops.cc
index 7b3b50c69559f6003a108fdf6a1325dbdbaa80a6..9dd964e5467cd855d67764a512e95a6a18f482e1 100644
--- a/tensorflow/compiler/xrt/ops/xrt_compile_ops.cc
+++ b/tensorflow/compiler/xrt/ops/xrt_compile_ops.cc
@@ -44,10 +44,10 @@ REGISTER_OP("XRTReleaseCompilationHandle")
     .SetShapeFn(tensorflow::shape_inference::NoOutputs)
     .Doc(
         R"(
-Discards a computation from the compilation cache. The handle cannot be
-subsequently used.
+Discards one or more computation handles from the compilation cache.
+The handle(s) cannot be subsequently used.
 
-'handle' is an id returned from a XRTCompile Op.
+'handle' is an ID (or vector of IDs) returned from a XRTCompile Op.
 )");
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
index fe6bee0dacf5dc2050613fc9ad34d3235b5a7b63..db58f0797df67774f4b74f77beef9a595342fe3f 100644
--- a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
+++ b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
@@ -127,10 +127,11 @@ REGISTER_OP("XRTReleaseAllocationHandle")
     .SetShapeFn(tensorflow::shape_inference::NoOutputs)
     .Doc(
         R"(
-Discards an allocation from device memory. The handle cannot be subsequently
+Discards one or more device memory handles. The handle(s) cannot be subsequently
 used.
 
-'handle' is the id returned from the Op that produced the on-device allocation.
+'handle' is the ID (or a vector of IDs) returned from the Op that produced the
+on-device allocation.
 )");
 
 REGISTER_OP("XRTReleaseAllAllocations")
diff --git a/tensorflow/compiler/xrt/tests/raw_api_test.cc b/tensorflow/compiler/xrt/tests/raw_api_test.cc
index 5f8121703e108f26b048feb7a0412a282f52892c..be0c4b9392258ac3bb0066d0e704ad403cdc211b 100644
--- a/tensorflow/compiler/xrt/tests/raw_api_test.cc
+++ b/tensorflow/compiler/xrt/tests/raw_api_test.cc
@@ -258,8 +258,102 @@ TEST(RawApiTest, AllocAndRewrite) {
   EXPECT_TRUE(new_response.ParseFromString(outputs[0].scalar<string>()()));
   EXPECT_TRUE(CompareLiteralProtos(new_literal, new_response));
 
-  auto release =
-      ops::XRTReleaseAllocationHandle(root, Input(allocation_handle));
+  Tensor release_tensor(DT_INT64, TensorShape({1}));
+  release_tensor.flat<int64>()(0) = allocation_handle;
+
+  auto release = ops::XRTReleaseAllocationHandle(root, release_tensor);
+  TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {}, {release},
+                           &outputs));
+}
+
+TEST(RawApiTest, AllocReleaseMany) {
+  xrt::XLAAllocation alloc1;
+  *alloc1.mutable_value() =
+      xla::LiteralUtil::CreateR2({{4, 5}, {6, 7}}).ToProto();
+  xrt::XLAAllocation alloc2;
+  *alloc2.mutable_value() =
+      xla::LiteralUtil::CreateR2({{6, 7}, {4, 5}}).ToProto();
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  auto value1 =
+      ops::Const(root.WithDevice("/device:CPU:0"), alloc1.SerializeAsString());
+  auto value2 =
+      ops::Const(root.WithDevice("/device:CPU:0"), alloc2.SerializeAsString());
+  auto handle1 = ops::XRTAllocate(root, value1);
+  auto handle2 = ops::XRTAllocate(root, value2);
+  TF_ASSERT_OK(root.status());
+
+  tensorflow::ClientSession session(root);
+  std::vector<tensorflow::Tensor> outputs;
+  TF_EXPECT_OK(session.Run({handle1, handle2}, &outputs));
+  EXPECT_EQ(outputs.size(), 2);
+
+  int64 allocation_handle1 = outputs[0].scalar<int64>()();
+  int64 allocation_handle2 = outputs[1].scalar<int64>()();
+
+  Tensor release_tensor(DT_INT64, TensorShape({2}));
+  release_tensor.flat<int64>()(0) = allocation_handle1;
+  release_tensor.flat<int64>()(1) = allocation_handle2;
+
+  auto release = ops::XRTReleaseAllocationHandle(root, release_tensor);
+  outputs.clear();
+  TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {}, {release},
+                           &outputs));
+}
+
+TEST(RawApiTest, CompileAndReleaseMany) {
+  xrt::XLAComputation c1;
+  auto config1 = c1.mutable_config();
+  auto shapes1 = config1->mutable_program_shape();
+  *shapes1->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes1->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes1->mutable_result() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  StoreComputationSnapshot(AddAndScale(), c1.mutable_hlo_snapshot());
+
+  xrt::XLAComputation c2;
+  auto config2 = c2.mutable_config();
+  auto shapes2 = config2->mutable_program_shape();
+  *shapes2->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes2->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes2->mutable_result() =
+      xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {2})})
+          .ToProto();
+  StoreComputationSnapshot(AddAndTuple(), c2.mutable_hlo_snapshot());
+
+  xrt::XRTExecutionConfig e;
+  e.set_release_input_handles(true);
+  e.set_release_compilation_handle(false);
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  auto e_config =
+      ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString());
+  auto computation1 =
+      ops::Const(root.WithDevice("/device:CPU:0"), c1.SerializeAsString());
+  auto c_handle1 = ops::XRTCompile(root, computation1);
+  auto computation2 =
+      ops::Const(root.WithDevice("/device:CPU:0"), c2.SerializeAsString());
+  auto c_handle2 = ops::XRTCompile(root, computation2);
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({c_handle1.handle, c_handle2.handle}, &outputs));
+  EXPECT_EQ(outputs.size(), 2);
+
+  int64 compilation_handle1 = outputs[0].scalar<int64>()();
+  int64 compilation_handle2 = outputs[1].scalar<int64>()();
+
+  Tensor release_tensor(DT_INT64, TensorShape({2}));
+  release_tensor.flat<int64>()(0) = compilation_handle1;
+  release_tensor.flat<int64>()(1) = compilation_handle2;
+
+  auto release = ops::XRTReleaseCompilationHandle(root, release_tensor);
+  outputs.clear();
   TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {}, {release},
                            &outputs));
 }
@@ -862,6 +956,7 @@ TEST(RawApiTest, CompileAndExecuteWithS64Argument) {
   xrt::XRTExecutionConfig e;
   e.set_release_input_handles(true);
   e.set_release_compilation_handle(true);
+  e.set_return_exploded_tuple(true);
 
   Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
   auto e_config =
diff --git a/tensorflow/compiler/xrt/xrt_compilation_cache.cc b/tensorflow/compiler/xrt/xrt_compilation_cache.cc
index d1405eae468492748ae88d842334a922dce272c6..8bf0f28d2233d9e7593365bc42187e327a1c4ac4 100644
--- a/tensorflow/compiler/xrt/xrt_compilation_cache.cc
+++ b/tensorflow/compiler/xrt/xrt_compilation_cache.cc
@@ -273,6 +273,8 @@ Status XRTCompilationCache::Lookup(
   return Status::OK();
 }
 
-string XRTCompilationCache::DebugString() { return "XRTCompilationCache"; }
+string XRTCompilationCache::DebugString() const {
+  return "XRTCompilationCache";
+}
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/xrt/xrt_compilation_cache.h b/tensorflow/compiler/xrt/xrt_compilation_cache.h
index c43d0fc47873abdc82ee937c155bebc346a05f17..7398e847d8b744f947adb03e1bcfd5c0a5b2cc55 100644
--- a/tensorflow/compiler/xrt/xrt_compilation_cache.h
+++ b/tensorflow/compiler/xrt/xrt_compilation_cache.h
@@ -118,7 +118,7 @@ class XRTCompilationCache : public ResourceBase {
   // EntryRef holding the program is returned in entry.
   Status Lookup(int64 uid, std::unique_ptr<XRTCompilationCacheEntryRef>* entry);
 
-  string DebugString() override;
+  string DebugString() const override;
 
  private:
   // An entry in the compilation cache. The entry is deleted once it has been
diff --git a/tensorflow/compiler/xrt/xrt_state.h b/tensorflow/compiler/xrt/xrt_state.h
index 3e3d5024124e13b87eed6f79596d50cd64325914..4aac37737eaf07dc622fdbfbfbc0775b3e2dafee 100644
--- a/tensorflow/compiler/xrt/xrt_state.h
+++ b/tensorflow/compiler/xrt/xrt_state.h
@@ -172,7 +172,7 @@ class XRTTupleAllocation : public ResourceBase {
   // ownership of the device memory is transferred to the result.
   xla::ShapeTree<xla::MaybeOwningDeviceMemory> ToDeviceMemoryTree(bool release);
 
-  string DebugString() override { return "XLA allocation handle"; }
+  string DebugString() const override { return "XLA allocation handle"; }
 
  private:
   // Creates a new handle with (tuple) shape.
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 832db0f4ab46911e067d17b4a125706c276cf798..307bb6eca35817c666038010192c03c4327d65c9 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -63,7 +63,6 @@ py_library(
         "//tensorflow/contrib/libsvm",
         "//tensorflow/contrib/linear_optimizer:sdca_estimator_py",
         "//tensorflow/contrib/linear_optimizer:sdca_ops_py",
-        "//tensorflow/contrib/lite/python:lite",
         "//tensorflow/contrib/lookup:lookup_py",
         "//tensorflow/contrib/losses:losses_py",
         "//tensorflow/contrib/losses:metric_learning_py",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 4f1a2a5693235183c8f486817b82c8c81fa389ec..af59686120593283e5dadd5c87c04d2ad24ff8a0 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -91,7 +91,6 @@ from tensorflow.contrib import tpu
 from tensorflow.contrib import training
 from tensorflow.contrib import util
 from tensorflow.contrib.eager.python import tfe as eager
-from tensorflow.contrib.lite.python import lite
 from tensorflow.contrib.optimizer_v2 import optimizer_v2_symbols as optimizer_v2
 from tensorflow.contrib.receptive_field import receptive_field_api as receptive_field
 from tensorflow.contrib.recurrent.python import recurrent_api as recurrent
diff --git a/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb b/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb
index 44532cb078f9bd1578172f8a7d8a4b55cd21a7cb..831c613f2c8c9a4fcc2cb9d313077fe79ee96fd7 100644
--- a/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb
+++ b/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb
@@ -186,8 +186,8 @@
         "\n",
         "  def __init__(self):\n",
         "    super(RnnColorbot, self).__init__()\n",
-        "    self.lower_cell = tf.contrib.rnn.LSTMBlockCell(256)\n",
-        "    self.upper_cell = tf.contrib.rnn.LSTMBlockCell(128)\n",
+        "    self.lower_cell = tf.contrib.rnn.LSTMBlockCell(256, dtype=tf.float32)\n",
+        "    self.upper_cell = tf.contrib.rnn.LSTMBlockCell(128, dtype=tf.float32)\n",
         "    self.relu_layer = tf.layers.Dense(3, activation=tf.nn.relu)\n",
         "\n",
         "  def _rnn_layer(self, chars, cell, batch_size, training):\n",
@@ -241,7 +241,7 @@
         "    seq = self._rnn_layer(seq, self.upper_cell, batch_size, training)\n",
         "\n",
         "    # Grab just the end-of-sequence from each output.\n",
-        "    indices = (length - 1, range(batch_size))\n",
+        "    indices = (length - 1, list(range(batch_size)))\n",
         "    indices = tf.stack(indices, 1)\n",
         "    sequence_ends = tf.gather_nd(seq, indices)\n",
         "    return self.relu_layer(sequence_ends)\n",
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_lib.h b/tensorflow/contrib/bigtable/kernels/bigtable_lib.h
index 4652021fecabfa11fa6a8754dc884d89e151b590..e3b4535bac4a01a1277290e0d1ea6d3c7613731c 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_lib.h
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_lib.h
@@ -42,7 +42,7 @@ class BigtableClientResource : public ResourceBase {
     return client_;
   }
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("BigtableClientResource(project_id: ", project_id_,
                            ", instance_id: ", instance_id_, ")");
   }
@@ -67,7 +67,7 @@ class BigtableTableResource : public ResourceBase {
 
   ::google::cloud::bigtable::noex::Table& table() { return table_; }
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat(
         "BigtableTableResource(client: ", client_->DebugString(),
         ", table: ", table_name_, ")");
diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
index 3fe71a2ea730cc9b60b2e2088a0d80a08b38d1a9..c04d5578eef0aadfc918c2de734723fe05c5cfee 100644
--- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
+++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h"
 
-#include "google/bigtable/v2/data.pb.h"
+#include "external/com_github_googleapis_googleapis/google/bigtable/v2/data.pb.h"
 #include "google/protobuf/wrappers.pb.h"
 #include "re2/re2.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
index ee052ac60387d8f993e4942dd7dff39e191dd3a4..47d910d42a27db4b857eeb12209dfbb429dd1be2 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
@@ -487,8 +487,8 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
     self.assertTrue(frac_below_upper_0 <= 0.98)
     self.assertTrue(frac_below_upper_1 >= 0.92)
     self.assertTrue(frac_below_upper_1 <= 0.98)
-    self.assertTrue(frac_both_below_upper >= 0.92)
-    self.assertTrue(frac_both_below_upper <= 0.98)
+    self.assertTrue(frac_both_below_upper >= 0.91)
+    self.assertTrue(frac_both_below_upper <= 0.99)
 
     train_input_fn, test_input_fn, _ = _quantile_regression_input_fns(
         two_dimension=True)
@@ -516,8 +516,8 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
     self.assertTrue(frac_above_lower_0 <= 0.98)
     self.assertTrue(frac_above_lower_1 >= 0.92)
     self.assertTrue(frac_above_lower_1 <= 0.98)
-    self.assertTrue(frac_both_above_lower >= 0.92)
-    self.assertTrue(frac_both_above_lower <= 0.98)
+    self.assertTrue(frac_both_above_lower >= 0.91)
+    self.assertTrue(frac_both_above_lower <= 0.99)
 
 
 class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
@@ -806,8 +806,8 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
     self.assertTrue(frac_below_upper_0 <= 0.98)
     self.assertTrue(frac_below_upper_1 >= 0.92)
     self.assertTrue(frac_below_upper_1 <= 0.98)
-    self.assertTrue(frac_both_below_upper >= 0.92)
-    self.assertTrue(frac_both_below_upper <= 0.98)
+    self.assertTrue(frac_both_below_upper >= 0.91)
+    self.assertTrue(frac_both_below_upper <= 0.99)
 
     train_input_fn, test_input_fn, _ = _quantile_regression_input_fns(
         two_dimension=True)
@@ -835,8 +835,8 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
     self.assertTrue(frac_above_lower_0 <= 0.98)
     self.assertTrue(frac_above_lower_1 >= 0.92)
     self.assertTrue(frac_above_lower_1 <= 0.98)
-    self.assertTrue(frac_both_above_lower >= 0.92)
-    self.assertTrue(frac_both_above_lower <= 0.98)
+    self.assertTrue(frac_both_above_lower >= 0.91)
+    self.assertTrue(frac_both_above_lower <= 0.99)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc b/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
index e446c411a8d5075563b8f8b912b29df310e16c8c..6faf6963011b698a3b233329d87471da7608e44a 100644
--- a/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
@@ -96,7 +96,7 @@ class StatsAccumulatorResource : public boosted_trees::StampedResource {
              TensorShapeUtils::IsScalar(hessian_shape));
   }
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("StatsAccumulatorResource[size=", values_.size(),
                            "]");
   }
diff --git a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
index fca22c71a83459cb290eaebcf107cf1c14c222b7..c3685b54e201f73039f6623443c67ba2b217a51e 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
@@ -62,8 +62,8 @@ class TreeEnsembleVariableSavable(saver.BaseSaverBuilder.SaveableObject):
         saver.BaseSaverBuilder.SaveSpec(ensemble_config, slice_spec,
                                         name + "_config"),
     ]
-    super(TreeEnsembleVariableSavable,
-          self).__init__(tree_ensemble_handle, specs, name)
+    super(TreeEnsembleVariableSavable, self).__init__(tree_ensemble_handle,
+                                                      specs, name)
     self._tree_ensemble_handle = tree_ensemble_handle
     self._create_op = create_op
 
@@ -115,7 +115,7 @@ class TreeEnsembleVariable(tracking.TrackableResource):
 
   def _gather_saveables_for_checkpoint(self):
     return {
-        "tree_ensemble_variable":
+        self.resource_handle.op.name + "/tree_ensemble_variable":
             functools.partial(
                 TreeEnsembleVariableSavable,
                 tree_ensemble_handle=self.resource_handle,
@@ -131,8 +131,8 @@ def tree_ensemble_variable(stamp_token,
 
   Args:
     stamp_token: The initial stamp token value for the ensemble resource.
-    tree_ensemble_config: A `Tensor` of type `string`.
-      Serialized proto of the tree ensemble.
+    tree_ensemble_config: A `Tensor` of type `string`. Serialized proto of the
+      tree ensemble.
     name: A name for the ensemble variable.
     container: An optional `string`. Defaults to `""`.
 
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index a5951fb7377d48748f5eb578c034176517df7749..e78ec476ab3b43e5eb56a2502008bb8020ae97e0 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -566,9 +566,10 @@ class GradientBoostedDecisionTreeModel(object):
     # Determine if ensemble is colocated with the inputs.
     if self._ensemble_handle.device != input_deps[0].device:
       # Create a local ensemble and get its local stamp.
-      with ops.name_scope("local_ensemble", "TreeEnsembleVariable") as name:
+      with ops.name_scope("local_ensemble", "TreeEnsembleVariable"):
         local_ensemble_handle = (
-            gen_model_ops.decision_tree_ensemble_resource_handle_op(name=name))
+            gen_model_ops.decision_tree_ensemble_resource_handle_op(
+                self._ensemble_handle.op.name + "/local_ensemble"))
         create_op = gen_model_ops.create_tree_ensemble_variable(
             local_ensemble_handle, stamp_token=-1, tree_ensemble_config="")
         with ops.control_dependencies([create_op]):
diff --git a/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h b/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h
index 94aeb2c7bb48c6eddb6c7894f8bf6f1567470113..0fe57c0a4e8375cc7ec7aca9553bded87e238b33 100644
--- a/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h
+++ b/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h
@@ -34,7 +34,7 @@ class DecisionTreeEnsembleResource : public StampedResource {
             protobuf::Arena::CreateMessage<
                 boosted_trees::trees::DecisionTreeEnsembleConfig>(&arena_)) {}
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("GTFlowDecisionTreeEnsemble[size=",
                            decision_tree_ensemble_->trees_size(), "]");
   }
diff --git a/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h b/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h
index fdaaae7f472c8f564ab45a8366d3746cbf1158ee..574e3065e7f46049815897ef73e44d33f0d23f0f 100644
--- a/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h
+++ b/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h
@@ -43,7 +43,7 @@ class QuantileStreamResource : public StampedResource {
     set_stamp(stamp_token);
   }
 
-  string DebugString() override { return "QuantileStreamResource"; }
+  string DebugString() const override { return "QuantileStreamResource"; }
 
   tensorflow::mutex* mutex() { return &mu_; }
 
diff --git a/tensorflow/contrib/checkpoint/python/BUILD b/tensorflow/contrib/checkpoint/python/BUILD
index ada41687261ab63286933d01da4e286173042e0c..4e529322c7c76797938468b405cd175609dc0a73 100644
--- a/tensorflow/contrib/checkpoint/python/BUILD
+++ b/tensorflow/contrib/checkpoint/python/BUILD
@@ -2,7 +2,7 @@ licenses(["notice"])  # Apache 2.0
 
 package(default_visibility = ["//tensorflow:internal"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
 py_library(
     name = "checkpoint",
@@ -27,17 +27,17 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "containers_test",
     srcs = ["containers_test.py"],
-    deps = [
+    additional_deps = [
         ":containers",
+        "@six_archive//:six",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python/training/checkpointable:base",
         "//tensorflow/python/training/checkpointable:util",
-        "@six_archive//:six",
     ],
 )
 
@@ -53,18 +53,18 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "python_state_test",
     srcs = ["python_state_test.py"],
-    deps = [
+    additional_deps = [
         ":python_state",
+        "//third_party/py/numpy",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:session",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:test",
         "//tensorflow/python/training/checkpointable:util",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -80,10 +80,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "split_dependency_test",
     srcs = ["split_dependency_test.py"],
-    deps = [
+    additional_deps = [
         ":split_dependency",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_test_lib",
@@ -106,10 +106,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "visualize_test",
     srcs = ["visualize_test.py"],
-    deps = [
+    additional_deps = [
         ":visualize",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:resource_variable_ops",
diff --git a/tensorflow/contrib/cloud/kernels/BUILD b/tensorflow/contrib/cloud/kernels/BUILD
index 1311063ec023bdaa2588d6f1c826bf900f7dea09..20f8c2b2453a58fdbe5a3587fa6687debd9c06d3 100644
--- a/tensorflow/contrib/cloud/kernels/BUILD
+++ b/tensorflow/contrib/cloud/kernels/BUILD
@@ -27,7 +27,6 @@ tf_kernel_library(
     deps = [
         ":bigquery_table_accessor",
         ":bigquery_table_partition_proto_cc",
-        "//tensorflow/contrib/cloud:bigquery_reader_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:reader_base",
@@ -79,7 +78,6 @@ tf_kernel_library(
     srcs = ["gcs_config_ops.cc"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/contrib/cloud:gcs_config_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/platform/cloud:curl_http_request",
diff --git a/tensorflow/contrib/cmake/external/abseil_cpp.cmake b/tensorflow/contrib/cmake/external/abseil_cpp.cmake
index 46a193971c5084523d432065f265fa7a9909f595..6c6a5df7f76723800740a81ccdcb137a0ec33846 100644
--- a/tensorflow/contrib/cmake/external/abseil_cpp.cmake
+++ b/tensorflow/contrib/cmake/external/abseil_cpp.cmake
@@ -31,17 +31,17 @@ if (systemlib_ABSEIL_CPP)
   message(STATUS "  abseil_cpp includes: ${ABSEIL_CPP_INCLUDE_DIR}")
   message(STATUS "  abseil_cpp libraries: ${ABSEIL_CPP_LIBRARIES}")
 
-  add_custom_target(abseil_cpp)
-  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp)
+  add_custom_target(abseil_cpp_build)
+  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp_build)
 
 else (systemlib_ABSEIL_CPP)
 
   include (ExternalProject)
 
-  set(abseil_cpp_INCLUDE_DIR ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp)
-  set(abseil_cpp_URL https://github.com/abseil/abseil-cpp/archive/e01d95528ea2137a4a27a88d1f57c6cb260aafed.tar.gz)
-  set(abseil_cpp_HASH SHA256=84043ed402d2a2a6ba4cdddb7e85118b1158fd81fe4ac3a14adc343d054c1e2e)
-  set(abseil_cpp_BUILD ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp-build)
+  set(abseil_cpp_INCLUDE_DIR ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp_build)
+  set(abseil_cpp_URL https://github.com/abseil/abseil-cpp.git)
+  set(abseil_cpp_TAG master)
+  set(abseil_cpp_BUILD ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp_build)
 
   if(WIN32)
     if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
@@ -49,8 +49,11 @@ else (systemlib_ABSEIL_CPP)
           ${abseil_cpp_BUILD}/absl/base/Release/absl_base.lib
           ${abseil_cpp_BUILD}/absl/base/Release/absl_dynamic_annotations.lib
           ${abseil_cpp_BUILD}/absl/base/Release/absl_internal_malloc_internal.lib
+          ${abseil_cpp_BUILD}/absl/base/Release/absl_internal_throw_delegate.lib
+          ${abseil_cpp_BUILD}/absl/numeric/Release/absl_int128.lib
           ${abseil_cpp_BUILD}/absl/strings/Release/absl_strings.lib
           ${abseil_cpp_BUILD}/absl/strings/Release/str_format_internal.lib
+          ${abseil_cpp_BUILD}/absl/time/Release/absl_time.lib
           ${abseil_cpp_BUILD}/absl/types/Release/absl_bad_optional_access.lib)
     else()
       set(abseil_cpp_STATIC_LIBRARIES
@@ -62,6 +65,7 @@ else (systemlib_ABSEIL_CPP)
           ${abseil_cpp_BUILD}/absl/numeric/absl_int128.lib
           ${abseil_cpp_BUILD}/absl/strings/absl_strings.lib
           ${abseil_cpp_BUILD}/absl/strings/str_format_internal.lib
+          ${abseil_cpp_BUILD}/absl/time/absl_time.lib
           ${abseil_cpp_BUILD}/absl/types/absl_bad_optional_access.lib)
     endif()
   else()
@@ -74,15 +78,18 @@ else (systemlib_ABSEIL_CPP)
         ${abseil_cpp_BUILD}/absl/numeric/libabsl_int128.a
         ${abseil_cpp_BUILD}/absl/strings/libabsl_strings.a
         ${abseil_cpp_BUILD}/absl/strings/libstr_format_internal.a
+        ${abseil_cpp_BUILD}/absl/time/libabsl_time.a
         ${abseil_cpp_BUILD}/absl/types/libabsl_bad_optional_access.a)
   endif()
 
-  ExternalProject_Add(abseil_cpp
+  ExternalProject_Add(abseil_cpp_build
       PREFIX abseil_cpp
-      URL ${abseil_cpp_URL}
-      URL_HASH ${abseil_cpp_HASH}
+      GIT_REPOSITORY ${abseil_cpp_URL}
       DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+      BUILD_IN_SOURCE 1
       BUILD_BYPRODUCTS ${abseil_cpp_STATIC_LIBRARIES}
+      BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release
+      COMMAND ${CMAKE_COMMAND} --build . --config Release
       INSTALL_COMMAND ""
       CMAKE_CACHE_ARGS
           -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
@@ -91,8 +98,10 @@ else (systemlib_ABSEIL_CPP)
   )
 
   include_directories(${abseil_cpp_INCLUDE_DIR})
+  message(STATUS ${abseil_cpp_INCLUDE_DIR})
+
   list(APPEND tensorflow_EXTERNAL_LIBRARIES ${abseil_cpp_STATIC_LIBRARIES})
 
-  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp)
+  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp_build)
 
 endif (systemlib_ABSEIL_CPP)
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/external/nsync.cmake b/tensorflow/contrib/cmake/external/nsync.cmake
index 479609458c64f7c7bd7b3ce6b23aceaa3db17f21..b15143bfc1cd787b156c9d6dd724a17730f0f8fb 100644
--- a/tensorflow/contrib/cmake/external/nsync.cmake
+++ b/tensorflow/contrib/cmake/external/nsync.cmake
@@ -16,7 +16,7 @@ include (ExternalProject)
 
 set(nsync_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/nsync/public)
 set(nsync_URL https://github.com/google/nsync)
-set(nsync_TAG 1.20.1)
+set(nsync_TAG 1.20.2)
 set(nsync_BUILD ${CMAKE_CURRENT_BINARY_DIR}/nsync/src/nsync)
 set(nsync_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/nsync/install)
 
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 96160568fa79291a7b391761373e1eaf0f70974e..21ae9a08a6bb8f71e5935ddde2d7bb3ed0cd8bbc 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -1,6 +1,9 @@
 # python_sanity_test.py will complain about invalid or missing entries
 # problematic entries can be commented for temporary whitelisting
 tensorflow
+tensorflow/compiler
+tensorflow/compiler/xla
+tensorflow/compiler/xla/service
 tensorflow/core
 tensorflow/core/example
 tensorflow/core/framework
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index d7b2a1339e047aba0a9424a53a63726805e89721..9ae5831f471414d3b610870f9e5f08265f7cb22a 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -302,8 +302,8 @@ file(GLOB_RECURSE tf_core_framework_srcs
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/session.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/session_factory.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/session_options.cc"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/*.cc"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/*.h"
+    "${tensorflow_source_dir}/tensorflow/core/summary/*.cc"
+    "${tensorflow_source_dir}/tensorflow/core/summary/*.h"
     "${tensorflow_source_dir}/public/*.h"
 )
 
@@ -317,14 +317,14 @@ file(GLOB_RECURSE tf_core_framework_exclude_srcs
     "${tensorflow_source_dir}/tensorflow/core/util/*test*.h"
     "${tensorflow_source_dir}/tensorflow/core/util/*test*.cc"
     "${tensorflow_source_dir}/tensorflow/core/util/*main.cc"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/*test*.cc"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/loader.cc"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/vacuum.cc"
+    "${tensorflow_source_dir}/tensorflow/core/summary/*test*.cc"
+    "${tensorflow_source_dir}/tensorflow/core/summary/loader.cc"
+    "${tensorflow_source_dir}/tensorflow/core/summary/vacuum.cc"
 )
 
 # TODO(jart): Why doesn't this work?
 # set_source_files_properties(
-#     ${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/snapfn.cc
+#     ${tensorflow_source_dir}/tensorflow/core/lib/db/snapfn.cc
 #     PROPERTIES COMPILE_FLAGS -DSQLITE_OMIT_LOAD_EXTENSION)
 
 list(REMOVE_ITEM tf_core_framework_srcs ${tf_core_framework_exclude_srcs})
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 8faccf8d55902e6701ebb4ce534b84705304fd5f..1fe8795ddf00232eba5a60a130e0845a6f6a8e17 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -802,6 +802,7 @@ add_custom_command(
       # tensorflow/__init__.py depends on files generated in this step. So, remove it while
       # this step is running since the files aren't there yet.
       COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+      COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
 
       # Run create_python_api.py to generate API init files.
       COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python "${PY_RUNTIME_ENV}" ${PYTHON_EXECUTABLE}
diff --git a/tensorflow/contrib/compiler/xla.py b/tensorflow/contrib/compiler/xla.py
index f867cd15b67dbd43650d8012b4299845af7200a8..2aa5d77b108d0f000550bd4d57fc4a922e7594e6 100644
--- a/tensorflow/contrib/compiler/xla.py
+++ b/tensorflow/contrib/compiler/xla.py
@@ -34,6 +34,7 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import function_utils
+from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 
@@ -76,7 +77,12 @@ def compile(computation, inputs=None):  # pylint: disable=redefined-builtin
 
       All `Operation`s returned from `computation` will be executed when
       evaluating any of the returned output tensors.
-    inputs: A list of input tensors or `None` (equivalent to an empty list).
+    inputs: A list of inputs or `None` (equivalent to an empty list). Each input
+      can be a nested structure containing values that are convertible to
+      tensors. Note that passing an N-dimension list of compatible values will
+      result in a N-dimention list of scalar tensors rather than a single Rank-N
+      tensors. If you need different behavior, convert part of inputs to tensors
+      with `tf.convert_to_tensor`.
 
   Returns:
     A list of output tensors.
@@ -260,17 +266,10 @@ def _compile_internal(computation, inputs=None):
   if not isinstance(inputs, collections.Sequence):
     raise TypeError('inputs must be a list')
 
+  # Flatten inputs.
+  flat_inputs = nest.flatten(inputs)
   # Converts inputs to Tensors.
-  inputs = [ops.convert_to_tensor(x) for x in inputs]
-  input_arity = len(inputs)
-
-  arg_error = check_function_argument_count(
-      computation, input_arity, infeed_queue=None)
-  if arg_error is not None:
-    raise TypeError(
-        'Supplied computation cannot be called with the specified inputs. You '
-        'specified %d inputs: %s, but the computation needs %s' %
-        (input_arity, str([i.name for i in inputs]), arg_error))
+  flat_inputs = [ops.convert_to_tensor(x) for x in flat_inputs]
 
   cluster_name = ops.get_default_graph().unique_name('cluster')
   pivot = control_flow_ops.no_op(name=cluster_name + '/pivot')
@@ -280,11 +279,15 @@ def _compile_internal(computation, inputs=None):
 
     # Add identity ops so even unused inputs are 'consumed' by the
     # computation.
-    computation_inputs = [
+    flat_inputs = [
         array_ops.identity(x, name='input_{}'.format(i))
-        for i, x in enumerate(inputs)
+        for i, x in enumerate(flat_inputs)
     ]
 
+    # Re-pack flat_inputs in same structure as 'inputs'.
+    computation_inputs = nest.pack_sequence_as(
+        structure=inputs, flat_sequence=flat_inputs)
+
     # Only resource variables work inside an XLA computation, so turn on
     # resource variables for the computation.
     vscope = variable_scope.get_variable_scope()
diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md
index 8a8dc159ade6f2a4a9b5ec29055ea4848492b29f..dbcaf8185fb7a9d2bcf22376439c0ebd49accb1a 100644
--- a/tensorflow/contrib/distribute/README.md
+++ b/tensorflow/contrib/distribute/README.md
@@ -43,28 +43,19 @@ the workers.
 
 Let's see how to scale to multiple GPUs on one machine using `MirroredStrategy` with [tf.keras] (https://www.tensorflow.org/guide/keras).
 
-Take a very simple model consisting of a single layer:
+Let's define a simple input dataset for training this model. Note that currently we require using
+[`tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset)
+with `DistributionStrategy`.
 
 ```python
 import tensorflow as tf
 from tensorflow import keras
 
-inputs = tf.keras.layers.Input(shape=(1,))
-predictions = tf.keras.layers.Dense(1)(inputs)
-model = tf.keras.models.Model(inputs=inputs, outputs=predictions)
-```
-
-Let's also define a simple input dataset for training this model. Note that currently we require using
-[`tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset)
-with `DistributionStrategy`.
-
-```python
 features = tf.data.Dataset.from_tensors([1.]).repeat(10000).batch(10)
 labels = tf.data.Dataset.from_tensors([1.]).repeat(10000).batch(10)
 train_dataset = tf.data.Dataset.zip((features, labels))
 ```
 
-
 To distribute this Keras model on multiple GPUs using `MirroredStrategy` we
 first instantiate a `MirroredStrategy` object.
 
@@ -72,14 +63,17 @@ first instantiate a `MirroredStrategy` object.
 distribution = tf.contrib.distribute.MirroredStrategy()
 ```
 
-We then compile the Keras model and pass the `MirroredStrategy` object in the
-`distribute` argument (apart from other usual arguments like `loss` and
-`optimizer`).
+Take a very simple model consisting of a single layer. We need to create and compile
+the model under the distribution strategy scope.
 
 ```python
-model.compile(loss='mean_squared_error',
-              optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.2),
-              distribute=distribution)
+with distribution.scope():
+  inputs = tf.keras.layers.Input(shape=(1,))
+  predictions = tf.keras.layers.Dense(1)(inputs)
+  model = tf.keras.models.Model(inputs=inputs, outputs=predictions)
+
+  model.compile(loss='mean_squared_error',
+                optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.2))
 ```
 
 To train the model we call Keras `fit` API using the input dataset that we
diff --git a/tensorflow/contrib/distribute/__init__.py b/tensorflow/contrib/distribute/__init__.py
index 8ec73654e30e4967f318c558ba94301e84a206e4..1bcc453a7e85b4e6737724cb5bdcd153cdd8c8ea 100644
--- a/tensorflow/contrib/distribute/__init__.py
+++ b/tensorflow/contrib/distribute/__init__.py
@@ -30,6 +30,7 @@ from tensorflow.contrib.distribute.python.monitor import Monitor
 from tensorflow.contrib.distribute.python.one_device_strategy import OneDeviceStrategy
 from tensorflow.contrib.distribute.python.parameter_server_strategy import ParameterServerStrategy
 from tensorflow.contrib.distribute.python.step_fn import *
+from tensorflow.contrib.distribute.python.tpu_strategy import initialize_tpu_system
 from tensorflow.contrib.distribute.python.tpu_strategy import TPUStrategy
 from tensorflow.python.distribute.cross_device_ops import *
 from tensorflow.python.distribute.distribute_config import DistributeConfig
@@ -58,11 +59,14 @@ _allowed_symbols = [
     'StandardSingleLossStep',
     'ReplicaContext',
     'TPUStrategy',
+    'initialize_tpu_system',
     'get_cross_replica_context',
     'get_distribution_strategy',
     'get_loss_reduction',
     'get_replica_context',
+    'get_strategy',
     'has_distribution_strategy',
+    'has_strategy',
     'in_cross_replica_context',
     'require_replica_context',
     'run_standard_tensorflow_server',
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 2d6a08df9a266095d3a9619e692ae9b96af93ceb..d4758d7518f4e209d58d559723c67e90473d8a79 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -1,8 +1,8 @@
 # Implementation of a prototype TF distributed computation library.
 
+load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_test")
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
-load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_test")
 
 package(
     default_visibility = [
@@ -23,17 +23,14 @@ cuda_py_test(
     additional_deps = [
         ":combinations",
         ":mirrored_strategy",
-        ":multi_worker_test_base",
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
-        "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/distribute:device_util",
         "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
@@ -45,14 +42,36 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "input_lib_test",
+    srcs = ["input_lib_test.py"],
+    additional_deps = [
+        ":combinations",
+        ":mirrored_strategy",
+        ":multi_worker_test_base",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/distribute:input_lib",
+        "//tensorflow/python/distribute:values",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+    ],
+    tags = [
+        "no_pip",
+    ],
+)
+
 py_library(
     name = "mirrored_strategy",
     srcs = ["mirrored_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:input_lib",
         "//tensorflow/python/distribute:mirrored_strategy",
-        "//tensorflow/python/distribute:values",
     ],
 )
 
@@ -61,18 +80,10 @@ py_library(
     srcs = ["parameter_server_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":mirrored_strategy",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python/distribute:cross_device_ops",
-        "//tensorflow/python/distribute:multi_worker_util",
-        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:parameter_server_strategy",
         "//tensorflow/python/distribute:values",
-        "//tensorflow/python/eager:context",
+        "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
     ],
 )
 
@@ -119,6 +130,8 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:input_lib",
+        "//tensorflow/python/distribute:numpy_dataset",
         "//tensorflow/python/distribute:reduce_util",
         "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
@@ -139,7 +152,9 @@ py_library(
         "//tensorflow/python:training",
         "//tensorflow/python/distribute:cross_device_ops",
         "//tensorflow/python/distribute:cross_device_utils",
+        "//tensorflow/python/distribute:input_lib",
         "//tensorflow/python/distribute:multi_worker_util",
+        "//tensorflow/python/distribute:numpy_dataset",
         "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
     ],
@@ -165,6 +180,7 @@ py_library(
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -289,6 +305,8 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
+        "//tensorflow/python/distribute:input_lib",
+        "//tensorflow/python/distribute:numpy_dataset",
         "//tensorflow/python/distribute:reduce_util",
         "//tensorflow/python/distribute:values",
     ],
@@ -660,7 +678,9 @@ cuda_py_test(
     additional_deps = [
         ":keras_correctness_test_lib",
     ],
-    shard_count = 16,
+    # Shard count is set to an odd number to distribute tasks across
+    # shards more evenly.
+    shard_count = 19,
     tags = [
         "multi_and_single_gpu",
         "no_oss",  # TODO(b/117919883): Fix python error.
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
index 12197c3d0dedee23d12732b8d4398f43bfc61caa..eee07543251321ae0c9eef57851431cf97c65643 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
@@ -26,9 +26,12 @@ from tensorflow.python.distribute import cross_device_ops as cross_device_ops_li
 from tensorflow.python.distribute import cross_device_utils
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
 from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import numpy_dataset
 from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
+from tensorflow.python.eager import tape
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import collective_ops
@@ -85,9 +88,11 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
     else:
       local_devices = ("/device:CPU:0",)
     self._worker_device = device_util.canonicalize("/device:CPU:0")
+    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)
 
     self._collective_keys = cross_device_utils.CollectiveKeys()
     self._initialize_local(local_devices)
+    # TODO(yuefengz): remove num_gpus_per_worker from CollectiveAllReduce.
     self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
         num_workers=self._num_workers,
         num_gpus_per_worker=num_gpus_per_worker,
@@ -120,6 +125,7 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
                                                 task_id)
 
     self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
+    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)
     if num_gpus_per_worker:
       local_devices = tuple(
           "%s/device:GPU:%d" % (self._worker_device, i)
@@ -130,7 +136,7 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
 
     self._collective_keys = cross_device_utils.CollectiveKeys()
     self._initialize_local(local_devices)
-    self._input_workers = values.InputWorkers(
+    self._input_workers = input_lib.InputWorkers(
         self._device_map, [(self._worker_device, self.worker_devices)])
     self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
         num_workers=self._num_workers,
@@ -156,19 +162,23 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
     if colocate_with is None:
       device_map = self._device_map
       logical_device = 0  # TODO(josh11b): Get logical device from scope here.
+    elif isinstance(colocate_with, numpy_dataset.SingleDevice):
+      with ops.device(colocate_with.device):
+        return next_creator(*args, **kwargs)
     else:
       device_map = colocate_with.device_map
       logical_device = colocate_with.logical_device
-    group_size = device_map.num_replicas_in_graph * self._num_workers
-    group_key = self._collective_keys.get_group_key(self.worker_devices)
 
     def _real_mirrored_creator(devices, *args, **kwargs):
       """Creates one MirroredVariable on the current worker."""
-      value_list = []
       unique_var_name = ops.get_default_graph().unique_name(
           kwargs["name"], mark_as_used=False).rstrip("/")
+      # pylint: disable=protected-access
       collective_instance_key = self._collective_keys.get_instance_key(
           key_id=unique_var_name)
+      # Only the first device participles in the broadcast of initial values.
+      group_key = self._collective_keys.get_group_key([devices[0]])
+      group_size = self._num_workers
       if "initial_value" not in kwargs:
         raise ValueError("Initial value must be specified.")
       initial_value = kwargs["initial_value"]
@@ -177,9 +187,33 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
       else:
         initial_value_fn = lambda: initial_value
 
+      value_list = []
       for i, d in enumerate(devices):
-        with ops.device(d):
-          if i > 0:
+        with ops.init_scope(), ops.device(d):
+          if i == 0:
+            # The initial value fn makes sure variables all initialized to
+            # same values. The first device of the chief worker will send their
+            # variable values to other workers.
+            def _overridden_initial_value_fn(device=d, index=i):  # pylint: disable=g-missing-docstring
+              with ops.device(device):
+                initial_value = initial_value_fn()
+                assert not callable(initial_value)
+                initial_value = ops.convert_to_tensor(initial_value)
+
+                assert index == 0, index
+                if self._num_workers > 1:
+                  if self._is_chief:
+                    bcast_send = collective_ops.broadcast_send(
+                        initial_value, initial_value.shape, initial_value.dtype,
+                        group_size, group_key, collective_instance_key)
+                    with ops.control_dependencies([bcast_send]):
+                      return array_ops.identity(initial_value)
+                  else:
+                    return collective_ops.broadcast_recv(
+                        initial_value.shape, initial_value.dtype, group_size,
+                        group_key, collective_instance_key)
+                return initial_value
+          else:
             # Give replicas meaningful distinct names:
             var0name = value_list[0].name.split(":")[0]
             # We append a / to variable names created on replicas with id > 0 to
@@ -187,30 +221,22 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
             # name as the absolute name of the variable.
             kwargs["name"] = "%s/replica_%d/" % (var0name, i)
 
-          # The initial value fn makes sure variables all initialized to
-          # same values. The first device of the chief worker will send their
-          # variable values to other devices and other workers.
-          def _overridden_initial_value_fn(device=d, index=i):  # pylint: disable=g-missing-docstring
-            with ops.device(device):
-              initial_value = initial_value_fn()
-              assert not callable(initial_value)
-              initial_value = ops.convert_to_tensor(initial_value)
-
-              if self._is_chief and index == 0:
-                bcast_send = collective_ops.broadcast_send(
-                    initial_value, initial_value.shape, initial_value.dtype,
-                    group_size, group_key, collective_instance_key)
-                with ops.control_dependencies([bcast_send]):
-                  return array_ops.identity(initial_value)
-              else:
-                return collective_ops.broadcast_recv(
-                    initial_value.shape, initial_value.dtype, group_size,
-                    group_key, collective_instance_key)
+            # Variables on non-first replica get initial values from the
+            # variables created on the first device of each worker.
+            def _overridden_initial_value_fn(device=d, index=i):
+              assert index > 0
+              with ops.device(device):
+                if context.executing_eagerly():
+                  return array_ops.identity(value_list[0].value())
+                else:
+                  return array_ops.identity(value_list[0].initial_value)
 
           kwargs["initial_value"] = _overridden_initial_value_fn
-
           with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
-            v = next_creator(*args, **kwargs)
+            # Don't record operations (e.g. other variable reads) during
+            # variable creation.
+            with tape.stop_recording():
+              v = next_creator(*args, **kwargs)
 
           if i == 0:
             actual_var_name = v.name.split(":")[0]
@@ -229,13 +255,13 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
     """Distributes the dataset to each local GPU."""
     # TODO(yuefengz): shard the dataset.
     worker_index = 0
-    return values.PerReplicaDataset(
+    return input_lib.PerReplicaDataset(
         self._call_dataset_fn(dataset_fn), self._input_workers, worker_index,
         prefetch_on_device=True)
 
   def _make_dataset_iterator(self, dataset):
-    return values.DatasetIterator(dataset, self._input_workers,
-                                  self._num_replicas_in_sync)
+    return input_lib.DatasetIterator(dataset, self._input_workers,
+                                     self._num_replicas_in_sync)
 
   def _make_input_fn_iterator(
       self,
@@ -252,7 +278,7 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
         input_pipeline_id=input_pipeline_id,
         num_replicas_in_sync=self._num_replicas_in_sync)
 
-    return values.InputFunctionIterator(
+    return input_lib.InputFunctionIterator(
         input_fn, self._input_workers, [input_context])
 
   def _configure(self,
@@ -346,4 +372,12 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
   def _global_batch_size(self):
-    return False
+    """`make_dataset_iterator` and `make_numpy_iterator` use global batch size.
+
+    `distribute_dataset` and `make_input_fn_iterator` assume per-replica
+    batching.
+
+    Returns:
+      Boolean.
+    """
+    return True
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
index 0fb672dded7624e798592d2f5c01945aa830021e..62ff4b178ea8a69c603d0e213de9efbada41eddb 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
@@ -192,6 +192,7 @@ class CollectiveAllReduceStrategyTestBase(
       image = random_ops.random_uniform([2, 28, 28])
       label = random_ops.random_uniform([2, 1], maxval=10, dtype=dtypes.int32)
       logits = model(image, training=True)
+      # TODO(yuefengz): make loss a callable for eager mode.
       loss = losses.sparse_softmax_cross_entropy(labels=label, logits=logits)
       optimizer = adam.AdamOptimizer(learning_rate=1e-4)
       train_op = optimizer.minimize(loss,
@@ -397,28 +398,38 @@ class DistributedCollectiveAllReduceStrategyTestWithChief(
         self._test_complex_model, self._cluster_spec, num_gpus=num_gpus)
 
 
-class LocalCollectiveAllReduceStrategy(CollectiveAllReduceStrategyTestBase,
-                                       strategy_test_lib.DistributionTestBase,
-                                       parameterized.TestCase):
+class LocalCollectiveAllReduceStrategy(
+    CollectiveAllReduceStrategyTestBase,
+    strategy_test_lib.DistributionTestBase,
+    strategy_test_lib.TwoDeviceDistributionTestBase,
+    parameterized.TestCase):
 
-  def testMinimizeLossGraph(self, num_gpus=2):
+  @combinations.generate(
+      combinations.combine(
+          mode=['graph', 'eager'], num_gpus=[2, 4], required_gpus=2))
+  def testMinimizeLoss(self, num_gpus):
     # Collective ops doesn't support strategy with one device.
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
-    self._test_minimize_loss_graph(None, None, num_gpus)
+    if context.executing_eagerly():
+      strategy, _, _ = self._get_test_object(None, None, num_gpus)
+      self._test_minimize_loss_eager(strategy)
+    else:
+      self._test_minimize_loss_graph(None, None, num_gpus)
 
-  def testComplexModel(self, num_gpus=2):
-    # Collective ops doesn't support strategy with one device.
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[2, 4], required_gpus=2))
+  def testComplexModel(self, num_gpus):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
     self._test_complex_model(None, None, num_gpus)
 
-  def testMakeInputFnIterator(self, num_gpus=2):
-    # Collective ops doesn't support strategy with one device.
-    if context.num_gpus() < num_gpus:
-      self.skipTest('Not enough GPUs')
-    dataset_fn = lambda: dataset_ops.Dataset.range(10)
-    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+  @combinations.generate(
+      combinations.combine(mode=['graph', 'eager'], required_gpus=2))
+  def testMakeInputFnIterator(self):
+    num_gpus = 2
+    dataset_fn = lambda: dataset_ops.Dataset.range(5 * num_gpus)
+    expected_values = [range(i, i + num_gpus) for i in range(0, 10, num_gpus)]
 
     input_fn = self._input_fn_to_test_input_context(
         dataset_fn,
@@ -428,6 +439,49 @@ class LocalCollectiveAllReduceStrategy(CollectiveAllReduceStrategyTestBase,
     self._test_input_fn_iterator(None, None, num_gpus,
                                  input_fn, expected_values)
 
+  def testAllReduceSum(self):
+    if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
+    distribution, target, config = self._get_test_object(None, None, num_gpus=2)
+    with self.cached_session(config=config, target=target):
+      self._test_all_reduce_sum(distribution)
+
+  def testAllReduceSumGradients(self):
+    if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
+    distribution, target, config = self._get_test_object(None, None, num_gpus=2)
+    with self.cached_session(config=config, target=target):
+      self._test_all_reduce_sum_gradients(distribution)
+
+  def testAllReduceSumGradientTape(self):
+    if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
+    distribution, target, config = self._get_test_object(None, None, num_gpus=2)
+    with self.cached_session(config=config, target=target):
+      self._test_all_reduce_sum_gradient_tape(distribution)
+
+  def testAllReduceMean(self):
+    if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
+    distribution, target, config = self._get_test_object(None, None, num_gpus=2)
+    with self.cached_session(config=config, target=target):
+      self._test_all_reduce_mean(distribution)
+
+  def testAllReduceMeanGradients(self):
+    if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
+    distribution, target, config = self._get_test_object(None, None, num_gpus=2)
+    with self.cached_session(config=config, target=target):
+      self._test_all_reduce_mean_gradients(distribution)
+
+  def testAllReduceMeanGradientTape(self):
+    if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
+    distribution, target, config = self._get_test_object(None, None, num_gpus=2)
+    with self.cached_session(config=config, target=target):
+      self._test_all_reduce_mean_gradient_tape(distribution)
+
+  def testNumpyIterator(self):
+    num_gpus = 2
+    if context.num_gpus() < num_gpus:
+      self.skipTest('Not enough GPUs')
+    strategy, _, _ = self._get_test_object(None, None, num_gpus)
+    self._test_numpy_iterator(strategy)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
index 4a934953ad2d4c6ecbe2bde2333a49bf8fd72821..f6c4291659b493b59b102b7ca83454f804898772 100644
--- a/tensorflow/contrib/distribute/python/combinations.py
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -46,7 +46,7 @@ import unittest
 from absl.testing import parameterized
 import six
 
-from tensorflow.contrib.cluster_resolver import TPUClusterResolver
+from tensorflow.contrib import cluster_resolver
 from tensorflow.contrib.distribute.python import mirrored_strategy as mirrored_lib
 from tensorflow.contrib.distribute.python import one_device_strategy as one_device_lib
 from tensorflow.contrib.distribute.python import tpu_strategy as tpu_lib
@@ -321,6 +321,15 @@ class NamedDistribution(object):
     return self._required_tpu
 
 
+def _get_tpu_strategy_creator(steps_per_run):
+  def _create_tpu_strategy():
+    resolver = cluster_resolver.TPUClusterResolver("")
+    tpu_lib.initialize_tpu_system(resolver)
+    strategy = tpu_lib.TPUStrategy(resolver, steps_per_run=steps_per_run)
+    return strategy
+  return _create_tpu_strategy
+
+
 # pylint: disable=g-long-lambda
 default_strategy = NamedDistribution(
     "Default",
@@ -330,13 +339,12 @@ one_device_strategy = NamedDistribution(
     "OneDeviceCPU", lambda: one_device_lib.OneDeviceStrategy("/cpu:0"),
     required_gpus=None)
 tpu_strategy = NamedDistribution(
-    "TPU", lambda: tpu_lib.TPUStrategy(
-        TPUClusterResolver(""), steps_per_run=2),
+    "TPU", _get_tpu_strategy_creator(steps_per_run=2),
     required_tpu=True)
 tpu_strategy_one_step = NamedDistribution(
-    "TPUOneStep", lambda: tpu_lib.TPUStrategy(
-        TPUClusterResolver(""), steps_per_run=1),
+    "TPUOneStep", _get_tpu_strategy_creator(steps_per_run=1),
     required_tpu=True)
+
 mirrored_strategy_with_one_cpu = NamedDistribution(
     "Mirrored1CPU",
     lambda: mirrored_lib.MirroredStrategy(["/cpu:0"]))
diff --git a/tensorflow/contrib/distribute/python/examples/keras_mnist.py b/tensorflow/contrib/distribute/python/examples/keras_mnist.py
index 60fda996642464135fe1fb8c314bcf7f04d19362..1ce91ecaf22a80a53124c8f00fac05c6b4711ed9 100644
--- a/tensorflow/contrib/distribute/python/examples/keras_mnist.py
+++ b/tensorflow/contrib/distribute/python/examples/keras_mnist.py
@@ -109,22 +109,21 @@ def main(_):
   tf.enable_eager_execution()
 
   train_ds, eval_ds, input_shape = get_input_datasets()
-  model = get_model(input_shape)
 
   # Instantiate the MirroredStrategy object. If we don't specify `num_gpus` or
   # the `devices` argument then all the GPUs available on the machine are used.
   # TODO(priyag): Use `tf.distribute.MirroredStrategy` once available.
   strategy = mirrored_strategy.MirroredStrategy(['/gpu:0', '/cpu:0'])
 
-  optimizer = rmsprop.RMSProp(learning_rate=0.001)
-
-  # Compile the model by passing the distribution strategy object to the
-  # `distribute` argument. `fit`, `evaluate` and `predict` will be distributed
-  # based on the strategy instantiated.
-  model.compile(loss=tf.keras.losses.categorical_crossentropy,
-                optimizer=optimizer,
-                metrics=['accuracy'],
-                distribute=strategy)
+  # Create and compile the model under Distribution strategy scope.
+  # `fit`, `evaluate` and `predict` will be distributed based on the strategy
+  # model was compiled with.
+  with strategy.scope():
+    model = get_model(input_shape)
+    optimizer = rmsprop.RMSProp(learning_rate=0.001)
+    model.compile(loss=tf.keras.losses.categorical_crossentropy,
+                  optimizer=optimizer,
+                  metrics=['accuracy'])
 
   # Train the model with the train dataset.
   model.fit(x=train_ds, epochs=20, steps_per_epoch=468)
diff --git a/tensorflow/contrib/distribute/python/input_lib_test.py b/tensorflow/contrib/distribute/python/input_lib_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f589cd6ad54ea8f33002cb067ef8d83d3d33036a
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/input_lib_test.py
@@ -0,0 +1,480 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the input_lib library."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import multi_worker_test_base
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import values
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.util import nest
+
+
+class PerReplicaDatasetTest(test.TestCase):
+
+  config = config_pb2.ConfigProto()
+  config.allow_soft_placement = True
+
+  def _test_iterator(self, devices, dataset, expected_values):
+    device_map = values.ReplicaDeviceMap(devices)
+    input_workers = input_lib.InputWorkers(device_map)
+    per_replica_dataset = input_lib.PerReplicaDataset(dataset, input_workers, 0)
+    if context.executing_eagerly():
+      iterator = per_replica_dataset.make_one_shot_iterator()
+    else:
+      iterator = per_replica_dataset.make_initializable_iterator()
+      self.evaluate([iterator.initializer])
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next_as_list()
+      computed_value = self.evaluate(next_element)
+      self.assertEqual(expected_value, computed_value)
+
+    with self.assertRaises(errors.OutOfRangeError):
+      next_element = iterator.get_next_as_list()
+      self.evaluate(next_element)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testOneDevice(self):
+    devices = ["/device:CPU:0"]
+    dataset = dataset_ops.Dataset.range(10)
+
+    expected_values = [[i] for i in range(10)]
+
+    self._test_iterator(devices, dataset, expected_values)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testMultipleDevices(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    dataset = dataset_ops.Dataset.range(10)
+
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+
+    self._test_iterator(devices, dataset, expected_values)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testTupleDataset(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    dataset1 = dataset_ops.Dataset.range(10)
+    dataset2 = dataset_ops.Dataset.range(10).map(lambda x: x**2)
+    dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
+
+    expected_values = [[(i, i**2), (i+1, (i+1)**2)] for i in range(0, 10, 2)]
+
+    self._test_iterator(devices, dataset, expected_values)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testUnevenDatasetBatches(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    dataset = dataset_ops.Dataset.range(11)
+
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+    self._test_iterator(devices, dataset, expected_values)
+
+  def testInitializableIterator(self):
+    with context.graph_mode():
+      devices = ["/device:CPU:0"]
+      # Using random input since that is only allowed with initializable
+      # iterator.
+      dataset = dataset_ops.Dataset.from_tensor_slices(
+          random_ops.random_uniform((10,)))
+
+      device_map = values.ReplicaDeviceMap(devices)
+      input_workers = input_lib.InputWorkers(device_map)
+      per_replica_dataset = input_lib.PerReplicaDataset(
+          dataset, input_workers, 0)
+      iterator = per_replica_dataset.make_initializable_iterator()
+
+      self.evaluate(iterator.initializer)
+      next_element = iterator.get_next_as_list()
+      for _ in range(10):
+        self.evaluate(next_element)
+
+      # Should fail after the input is finished.
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(next_element)
+
+      # After re-initializing the iterator, should be able to iterate again.
+      self.evaluate(iterator.initializer)
+      for _ in range(10):
+        self.evaluate(next_element)
+
+
+class MultiWorkerDatasetTest(multi_worker_test_base.MultiWorkerTestBase):
+
+  def _test_iterator(self, sess, iterator, devices, expected_values):
+    next_element = iterator.get_next()
+    for r, device in enumerate(devices):
+      v = values.select_replica(r, next_element)
+      # The `v` here can be a tuple.
+      for element in nest.flatten(v):
+        self.assertTrue(element.device in device)
+
+    for expected_value in expected_values:
+      t = [values.select_replica(r, next_element) for r in range(len(devices))]
+      actual = sess.run(t)
+      self.assertEqual(expected_value, actual)
+
+    with self.assertRaises(errors.OutOfRangeError):
+      sess.run([values.select_replica(r, next_element)
+                for r in range(len(devices))])
+
+  def _test_dataset(self, dataset_fn, worker_devices, devices,
+                    expected_values):
+    device_map = values.ReplicaDeviceMap(devices)
+    input_workers = input_lib.InputWorkers(device_map, worker_devices)
+    multi_worker_dataset = input_lib.MultiWorkerDataset(
+        dataset_fn, input_workers)
+    multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()
+    with self.cached_session() as sess:
+      sess.run(multi_worker_iterator.initializer)
+      self._test_iterator(sess, multi_worker_iterator, devices, expected_values)
+
+  def _cpu_devices(self):
+    worker_devices = (
+        ("/job:worker/replica:0/task:0",
+         ["/job:worker/replica:0/task:0/device:CPU:0"]),
+        ("/job:worker/replica:0/task:1",
+         ["/job:worker/replica:0/task:1/device:CPU:0"])
+    )
+    devices = [
+        "/job:worker/replica:0/task:0/device:CPU:0",
+        "/job:worker/replica:0/task:1/device:CPU:0"
+    ]
+    return worker_devices, devices
+
+  def _cpu_and_one_gpu_devices(self):
+    worker_devices = (
+        ("/job:worker/replica:0/task:0", (
+            "/job:worker/replica:0/task:0/device:GPU:0",
+            "/job:worker/replica:0/task:0/device:CPU:0"
+        )),
+        ("/job:worker/replica:0/task:1", (
+            "/job:worker/replica:0/task:1/device:GPU:0",
+            "/job:worker/replica:0/task:1/device:CPU:0"
+        ))
+    )
+    devices = [
+        "/job:worker/replica:0/task:0/device:GPU:0",
+        "/job:worker/replica:0/task:0/device:CPU:0",
+        "/job:worker/replica:0/task:1/device:GPU:0",
+        "/job:worker/replica:0/task:1/device:CPU:0"
+    ]
+    return worker_devices, devices
+
+  def testDataDistributionOneDevicePerWorker(self):
+    worker_devices, devices = self._cpu_devices()
+    with context.graph_mode():
+      dataset_fn = lambda: dataset_ops.Dataset.range(8)
+      self._test_dataset(
+          dataset_fn, worker_devices, devices,
+          [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]])
+
+  def testDataDistributionTwoDevicePerWorker(self):
+    if context.num_gpus() < 1:
+      self.skipTest("A GPU is not available for this test.")
+    worker_devices, devices = self._cpu_and_one_gpu_devices()
+    with context.graph_mode():
+      dataset_fn = lambda: dataset_ops.Dataset.range(8)
+      self._test_dataset(
+          dataset_fn, worker_devices, devices,
+          [[0, 1, 0, 1], [2, 3, 2, 3], [4, 5, 4, 5], [6, 7, 6, 7]])
+
+  def testTupleDataset(self):
+    worker_devices, devices = self._cpu_devices()
+
+    with context.graph_mode():
+
+      def dataset_fn():
+        dataset1 = dataset_ops.Dataset.range(8)
+        dataset2 = dataset_ops.Dataset.range(8).map(lambda x: x**2)
+        return dataset_ops.Dataset.zip((dataset1, dataset2))
+
+      expected_values = [[(i, i**2), (i, i**2)] for i in range(8)]
+      self._test_dataset(dataset_fn, worker_devices, devices,
+                         expected_values)
+
+  def testInitializableIterator(self):
+    worker_devices, devices = self._cpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      dataset_fn = lambda: dataset_ops.Dataset.range(8)
+      device_map = values.ReplicaDeviceMap(devices)
+      input_workers = input_lib.InputWorkers(device_map, worker_devices)
+      multi_worker_dataset = input_lib.MultiWorkerDataset(
+          dataset_fn, input_workers)
+      multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()
+
+      sess.run(multi_worker_iterator.initializer)
+      self._test_iterator(
+          sess, multi_worker_iterator, devices,
+          [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]])
+
+      # After re-initializing the iterator, should be able to iterate again.
+      sess.run(multi_worker_iterator.initializer)
+      self._test_iterator(
+          sess, multi_worker_iterator, devices,
+          [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]])
+
+  def testValueErrorForIterator(self):
+    # Incompatiable arguments.
+    d1 = "/device:GPU:0"
+    d2 = "/device:GPU:1"
+    device_map = values.ReplicaDeviceMap([d1, d2])
+    input_workers = input_lib.InputWorkers(
+        device_map, (("w1", (d1,)), ("w2", (d2,))))
+    with self.assertRaises(ValueError):
+      input_lib.MultiWorkerDataIterator([("w1", None)], input_workers)
+
+  def testDuplicateDevices(self):
+    _, devices = self._cpu_devices()
+    devices.append("/job:worker/replica:0/task:0/device:CPU:0")
+    with self.assertRaises(ValueError):
+      _ = values.ReplicaDeviceMap(devices)
+
+
+class InputIteratorTestBase(test.TestCase):
+
+  def _test_iterator(self, input_type, dataset_fn, worker_device_pairs,
+                     expected_values, sess=None, split_batch_by=None):
+    devices = nest.flatten([ds for _, ds in worker_device_pairs])
+    device_map = values.ReplicaDeviceMap(devices)
+    input_workers = input_lib.InputWorkers(device_map, worker_device_pairs)
+
+    if input_type == "input_fn":
+      input_contexts = [
+          distribute_lib.InputContext() for _ in worker_device_pairs]
+      input_fn = lambda _: dataset_fn()
+      iterator = input_lib.InputFunctionIterator(
+          input_fn, input_workers, input_contexts)
+    else:
+      iterator = input_lib.DatasetIterator(
+          dataset_fn(), input_workers, split_batch_by)
+
+    evaluate = lambda x: sess.run(x) if sess else self.evaluate(x)
+
+    evaluate(control_flow_ops.group(iterator.initialize()))
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      computed_value = evaluate(
+          [values.select_replica(r, next_element) for r in range(len(devices))])
+      self.assertAllEqual(expected_value, computed_value)
+
+    with self.assertRaises(errors.OutOfRangeError):
+      next_element = iterator.get_next()
+      evaluate([values.select_replica(r, next_element)
+                for r in range(len(devices))])
+
+    # After re-initializing the iterator, should be able to iterate again.
+    evaluate(control_flow_ops.group(iterator.initialize()))
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      computed_value = evaluate(
+          [values.select_replica(r, next_element) for r in range(len(devices))])
+      self.assertAllEqual(expected_value, computed_value)
+
+
+class InputIteratorSingleWorkerTest(InputIteratorTestBase,
+                                    parameterized.TestCase):
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"]))
+  def testOneDeviceCPU(self, input_type):
+    worker_device_pairs = [("", ["/device:CPU:0"])]
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+
+    expected_values = [[i] for i in range(10)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testTwoDevicesOneGPUOneCPU(self, input_type):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testTupleDataset(self, input_type):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    def dataset_fn():
+      dataset1 = dataset_ops.Dataset.range(10)
+      dataset2 = dataset_ops.Dataset.range(10).map(lambda x: x**2)
+      return dataset_ops.Dataset.zip((dataset1, dataset2))
+
+    expected_values = [[(i, i**2), (i+1, (i+1)**2)] for i in range(0, 10, 2)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testUnevenDatasetBatches(self, input_type):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    dataset_fn = lambda: dataset_ops.Dataset.range(11)
+
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["dataset"],
+      split_batch_by=[None, 2],
+      required_gpus=1))
+  def testBatchSplitting(self, input_type, split_batch_by):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    batch_size = 10
+    dataset_fn = lambda: dataset_ops.Dataset.range(100).batch(batch_size)
+
+    updated_batch_size = (
+        batch_size // split_batch_by if split_batch_by else batch_size)
+    expected_values = [[range(i, i+updated_batch_size),
+                        range(i+updated_batch_size, i+2*updated_batch_size)]
+                       for i in range(0, 100, updated_batch_size*2)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values, sess=None,
+                        split_batch_by=split_batch_by)
+
+
+class InputIteratorMultiWorkerTest(
+    multi_worker_test_base.MultiWorkerTestBase, InputIteratorTestBase,
+    parameterized.TestCase):
+
+  def _cpu_devices(self):
+    return [
+        ("/job:worker/replica:0/task:0",
+         ["/job:worker/replica:0/task:0/device:CPU:0"]),
+        ("/job:worker/replica:0/task:1",
+         ["/job:worker/replica:0/task:1/device:CPU:0"])]
+
+  def _cpu_and_one_gpu_devices(self):
+    return [
+        ("/job:worker/replica:0/task:0", [
+            "/job:worker/replica:0/task:0/device:GPU:0",
+            "/job:worker/replica:0/task:0/device:CPU:0"
+        ]),
+        ("/job:worker/replica:0/task:1", [
+            "/job:worker/replica:0/task:1/device:GPU:0",
+            "/job:worker/replica:0/task:1/device:CPU:0"
+        ])
+    ]
+
+  @combinations.generate(combinations.combine(
+      mode=["graph"],
+      input_type=["input_fn", "dataset"]))
+  def testOneDevicePerWorker(self, input_type):
+    worker_devices = self._cpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      dataset_fn = lambda: dataset_ops.Dataset.range(4)
+      self._test_iterator(input_type, dataset_fn, worker_devices,
+                          [[0, 0], [1, 1], [2, 2], [3, 3]], sess)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testTwoDevicesPerWorker(self, input_type):
+    worker_devices = self._cpu_and_one_gpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      dataset_fn = lambda: dataset_ops.Dataset.range(4)
+      self._test_iterator(input_type, dataset_fn, worker_devices,
+                          [[0, 1, 0, 1], [2, 3, 2, 3]], sess)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph"],
+      input_type=["input_fn", "dataset"]))
+  def testTupleDataset(self, input_type):
+    worker_devices = self._cpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      def dataset_fn():
+        dataset1 = dataset_ops.Dataset.range(4)
+        dataset2 = dataset_ops.Dataset.range(4).map(lambda x: x**2)
+        return dataset_ops.Dataset.zip((dataset1, dataset2))
+
+      expected_values = [[(i, i**2), (i, i**2)] for i in range(0, 4)]
+      self._test_iterator(input_type, dataset_fn, worker_devices,
+                          expected_values, sess)
+
+
+class SplitDatasetBatchTest(test.TestCase):
+
+  def testBatchDataset(self):
+    dataset = dataset_ops.Dataset.range(100).batch(20)
+    split_batch_by = 2
+    result_dataset = input_lib._split_dataset_batch(dataset, split_batch_by)
+    expected_values = [range(i, i+10) for i in range(0, 100, 10)]
+    result = [self.evaluate(el) for el in result_dataset]
+    self.assertAllEqual(expected_values, result)
+
+  def testMapAndBatchDataset(self):
+    dataset = dataset_ops.Dataset.range(100)
+    dataset = dataset.apply(batching.map_and_batch(lambda x: x, 20))
+    split_batch_by = 2
+    result_dataset = input_lib._split_dataset_batch(dataset, split_batch_by)
+    expected_values = [range(i, i+10) for i in range(0, 100, 10)]
+    result = [self.evaluate(el) for el in result_dataset]
+    self.assertAllEqual(expected_values, result)
+
+  def testPrefetchDataset(self):
+    dataset = dataset_ops.Dataset.range(100).batch(20).prefetch(1)
+    split_batch_by = 2
+    result_dataset = input_lib._split_dataset_batch(dataset, split_batch_by)
+    expected_values = [range(i, i+10) for i in range(0, 100, 10)]
+    result = [self.evaluate(el) for el in result_dataset]
+    self.assertAllEqual(expected_values, result)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_correctness_test.py b/tensorflow/contrib/distribute/python/keras_correctness_test.py
index 3abcdde65b2ecf79a9c7f26dd4c6d1325dd2c5d9..3abdee2c0ed8ebe60876a6d43bb90d8f4b7a5052 100644
--- a/tensorflow/contrib/distribute/python/keras_correctness_test.py
+++ b/tensorflow/contrib/distribute/python/keras_correctness_test.py
@@ -34,6 +34,8 @@ from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_de
 from tensorflow.python.training import gradient_descent
 
 _RANDOM_SEED = 1337
+_EVAL_STEPS = 20
+_GLOBAL_BATCH_SIZE = 64
 
 # Note: Please make sure the tests in this file are also covered in
 # keras_backward_compat_test for features that are supported with both APIs.
@@ -61,12 +63,23 @@ def all_strategy_combinations_with_graph_mode():
 
 
 def strategy_and_input_combinations():
+  def cnn_model_with_batch_norm(**kwargs):
+    return _create_cnn_model(with_batch_norm=True, **kwargs)
+
   return (
       combinations.times(
           combinations.combine(distribution=all_strategies),
           combinations.combine(mode=['graph', 'eager'],
                                use_numpy=[True, False],
-                               use_validation_data=[True, False])))
+                               use_validation_data=[True, False]),
+          combinations.combine(model_with_data=[
+              ModelWithData('dnn', _create_dnn_model, _dnn_training_data),
+              ModelWithData('cnn', _create_cnn_model, _cnn_training_data),
+              ModelWithData('cnn_batch_norm',
+                            cnn_model_with_batch_norm,
+                            _cnn_training_data,
+                            with_batch_norm=True),
+          ])))
 
 
 class MaybeDistributionScope(object):
@@ -87,7 +100,35 @@ class MaybeDistributionScope(object):
       self._scope = None
 
 
-def _create_dnn_model(weights=None, distribution=None, compile_model=True):
+class ModelWithData(object):
+  """An object giving a good name in combinations.
+
+  The model_fn must take two arguments: initial_weights and distribution.
+  """
+
+  def __init__(self, name, model_fn, data_fn, with_batch_norm=False):
+    self.name = name
+    self.model_fn = model_fn
+    self.data_fn = data_fn
+    self.with_batch_norm = with_batch_norm
+
+  def __repr__(self):
+    return self.name
+
+
+def _dnn_training_data():
+  # TODO(xiejw): Change this back to 10000, once we support final partial
+  # batch.
+  num_samples = 9984
+  x_train = np.random.rand(num_samples, 1)
+  y_train = 3 * x_train
+  x_train = x_train.astype('float32')
+  y_train = y_train.astype('float32')
+  x_predict = [[1.], [2.], [3.], [4.]]
+  return x_train, y_train, x_predict
+
+
+def _create_dnn_model(initial_weights=None, distribution=None):
   with MaybeDistributionScope(distribution):
     # We add few non-linear layers to make it non-trivial.
     model = keras.Sequential()
@@ -96,17 +137,60 @@ def _create_dnn_model(weights=None, distribution=None, compile_model=True):
     model.add(keras.layers.Dense(10, activation='relu'))
     model.add(keras.layers.Dense(1))
 
-    if weights:
-      model.set_weights(weights)
+    if initial_weights:
+      model.set_weights(initial_weights)
 
-    if compile_model:
-      model.compile(
-          loss=keras.losses.mean_squared_error,
-          optimizer=gradient_descent_keras.SGD(0.5),
-          metrics=['mse'])
+    model.compile(
+        loss=keras.losses.mean_squared_error,
+        optimizer=gradient_descent_keras.SGD(0.5),
+        metrics=['mse'])
     return model
 
 
+def _cnn_training_data(count=_GLOBAL_BATCH_SIZE * _EVAL_STEPS,
+                       shape=(28, 28, 3), num_classes=10):
+  centers = np.random.randn(num_classes, *shape)
+
+  features = []
+  labels = []
+  for _ in range(count):
+    label = np.random.randint(0, num_classes, size=1)[0]
+    offset = np.random.normal(loc=0, scale=0.1, size=np.prod(shape))
+    offset = offset.reshape(shape)
+    labels.append(label)
+    features.append(centers[label] + offset)
+
+  x_train = np.asarray(features, dtype=np.float32)
+  y_train = np.asarray(labels, dtype=np.float32).reshape((count, 1))
+  x_predict = x_train
+  return x_train, y_train, x_predict
+
+
+def _create_cnn_model(initial_weights=None, distribution=None,
+                      with_batch_norm=False):
+  with MaybeDistributionScope(distribution):
+    image = keras.layers.Input(shape=(28, 28, 3), name='image')
+    c1 = keras.layers.Conv2D(
+        name='conv1', filters=16, kernel_size=(3, 3), strides=(4, 4))(
+            image)
+    if with_batch_norm:
+      c1 = keras.layers.BatchNormalization(name='bn1')(c1)
+    c1 = keras.layers.MaxPooling2D(pool_size=(2, 2))(c1)
+    logits = keras.layers.Dense(
+        10, activation='softmax', name='pred')(
+            keras.layers.Flatten()(c1))
+    model = keras.Model(inputs=[image], outputs=[logits])
+
+    if initial_weights:
+      model.set_weights(initial_weights)
+
+    model.compile(
+        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.1),
+        loss='sparse_categorical_crossentropy',
+        metrics=['sparse_categorical_accuracy'])
+  return model
+
+
 def batch_wrapper(dataset, batch_size, distribution, repeat=None):
   if repeat:
     dataset = dataset.repeat(repeat)
@@ -133,7 +217,7 @@ def get_correctness_test_inputs(use_numpy, use_validation_data,
                                 with_distribution, x_train, y_train, x_predict):
   """Generates the inputs for correctness check when enable Keras with DS."""
   training_epochs = 2
-  global_batch_size = 64
+  global_batch_size = _GLOBAL_BATCH_SIZE
   batch_size = get_batch_size(global_batch_size, with_distribution)
 
   if use_numpy:
@@ -158,6 +242,11 @@ def get_correctness_test_inputs(use_numpy, use_validation_data,
         'x': np.array(x_predict, dtype=np.float32),
     }
   else:
+    if len(x_train) < _GLOBAL_BATCH_SIZE * _EVAL_STEPS:
+      # Currently, we cannot detech the size of a dataset. So, the eval steps is
+      # hard coded.
+      raise ValueError('x_train must have at least '
+                       '_GLOBAL_BATCH_SIZE * _EVAL_STEPS samples')
     # For dataset inputs, we do not pass batch_size to
     # keras.fit/evaluate/predict. The batch size is part of the dataset.
     train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
@@ -183,7 +272,7 @@ def get_correctness_test_inputs(use_numpy, use_validation_data,
           'batch_size': None,
           'x': x,
           'y': None,
-          'steps': 20,
+          'steps': _EVAL_STEPS,
       }
 
     predict_batch_size = get_batch_size(len(x_predict), with_distribution)
@@ -199,8 +288,8 @@ def get_correctness_test_inputs(use_numpy, use_validation_data,
 
 
 def fit_eval_and_predict(
-    initial_weights, input_fn, distribution=None):
-  model = _create_dnn_model(initial_weights, distribution)
+    initial_weights, input_fn, model_fn, distribution=None):
+  model = model_fn(initial_weights=initial_weights, distribution=distribution)
   training_inputs, eval_inputs, predict_inputs = input_fn(distribution)
 
   result = {}
@@ -351,7 +440,8 @@ class TestDistributionStrategyCorrectness(test.TestCase,
       self.assertEqual(outs[2], 0.)
 
   @combinations.generate(strategy_and_input_combinations())
-  def test_correctness(self, distribution, use_numpy, use_validation_data):
+  def test_correctness(self, distribution, use_numpy, use_validation_data,
+                       model_with_data):
     if self._should_skip_tpu_with_eager(distribution):
       self.skipTest('TPUStrategy does not support eager mode now.')
 
@@ -366,21 +456,15 @@ class TestDistributionStrategyCorrectness(test.TestCase,
       np.random.seed(_RANDOM_SEED)
       random_seed.set_random_seed(_RANDOM_SEED)
 
+      model_fn, data_fn = model_with_data.model_fn, model_with_data.data_fn
       # Train, eval, and predict datasets are created with the same input numpy
       # arrays.
-      # TODO(xiejw): Change this back to 10000, once we support final partial
-      # batch.
-      num_samples = 9984
-      x_train = np.random.rand(num_samples, 1)
-      y_train = 3 * x_train
-      x_train = x_train.astype('float32')
-      y_train = y_train.astype('float32')
-      x_predict = [[1.], [2.], [3.], [4.]]
+      x_train, y_train, x_predict = data_fn()
 
       # The model is built once and the initial weights are saved.
       # This is used to initialize the model for both the distribution and
       # non-distribution run.
-      model = _create_dnn_model(compile_model=False)
+      model = model_fn()
       initial_weights = model.get_weights()
 
       def input_fn(dist):
@@ -388,11 +472,22 @@ class TestDistributionStrategyCorrectness(test.TestCase,
             use_numpy, use_validation_data, dist, x_train, y_train, x_predict)
 
       results_with_ds = fit_eval_and_predict(
-          initial_weights, input_fn=input_fn, distribution=distribution)
+          initial_weights, input_fn=input_fn, model_fn=model_fn,
+          distribution=distribution)
       results_without_ds = fit_eval_and_predict(
-          initial_weights, input_fn=input_fn, distribution=None)
-      compare_results(results_with_ds, results_without_ds, distribution,
-                      testcase=self)
+          initial_weights, input_fn=input_fn, model_fn=model_fn,
+          distribution=None)
+
+      # First, special case, for multi-replica distributed training, batch norm
+      # is not aggregated globally. So it is expected to have different weights.
+      if (model_with_data.with_batch_norm and
+          distribution.num_replicas_in_sync > 1):
+        with self.assertRaises(AssertionError):
+          compare_results(results_with_ds, results_without_ds, distribution,
+                          testcase=self)
+      else:
+        compare_results(results_with_ds, results_without_ds, distribution,
+                        testcase=self)
 
   @combinations.generate(all_strategy_combinations_with_graph_mode())
   def test_dynamic_lr(self, distribution):
@@ -403,13 +498,9 @@ class TestDistributionStrategyCorrectness(test.TestCase,
       np.random.seed(_RANDOM_SEED)
       random_seed.set_random_seed(_RANDOM_SEED)
 
-      # TODO(xiejw): Change this back to 10000, once we support final partial
-      # batch.
-      num_samples = 9984
-      x_train = np.random.rand(num_samples, 1).astype('float32')
-      y_train = 3 * x_train
+      x_train, y_train, _ = _dnn_training_data()
 
-      model = _create_dnn_model(compile_model=False)
+      model = _create_dnn_model()
       initial_weights = model.get_weights()
 
       update_freq = None
@@ -439,9 +530,11 @@ class TestDistributionStrategyCorrectness(test.TestCase,
         return training_inputs, eval_inputs, predict_inputs
 
       results_with_ds = fit_eval_and_predict(
-          initial_weights, input_fn=input_fn, distribution=distribution)
+          initial_weights, input_fn=input_fn, model_fn=_create_dnn_model,
+          distribution=distribution)
       results_without_ds = fit_eval_and_predict(
-          initial_weights, input_fn=input_fn, distribution=None)
+          initial_weights, input_fn=input_fn, model_fn=_create_dnn_model,
+          distribution=None)
       compare_results(results_with_ds, results_without_ds, distribution,
                       testcase=self)
 
diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py
index b99d11b923155b1362b434ffb82a96a11b08d352..40916afcfaa8c99dc6e7400f0c88aee2f668a46c 100644
--- a/tensorflow/contrib/distribute/python/keras_test.py
+++ b/tensorflow/contrib/distribute/python/keras_test.py
@@ -245,6 +245,18 @@ def all_strategy_combinations():
   return strategy_minus_tpu_combinations() + tpu_strategy_combinations()
 
 
+def all_strategy_combinations_minus_default():
+  strategy_minus_default_combinations = combinations.combine(
+      distribution=[
+          combinations.one_device_strategy,
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph', 'eager'])
+  return strategy_minus_default_combinations + tpu_strategy_combinations()
+
+
 # TODO(priyag): Add v2 optimizers here.
 def strategy_and_optimizer_combinations():
   return combinations.times(
@@ -417,15 +429,6 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
 class TestDistributionStrategyWithNumpyArrays(test.TestCase,
                                               parameterized.TestCase):
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
-  def test_creating_var_with_numpy_arrays(self, distribution):
-    with self.cached_session():
-      x = np.asarray(np.random.random((64, 3)), dtype=np.float32)
-      var_x = distributed_training_utils.get_var_for_numpy(distribution, x)
-      val = self.evaluate(var_x.value())
-      # Verify that the numpy value is copied to the variable.
-      self.assertAllEqual(x, val)
-
   @combinations.generate(strategy_for_numpy_input_combinations())
   def test_calculating_input_params_no_steps_no_batch_size(self, distribution):
     # Calculate the per_replica_batch_size scaling factor for strategies
@@ -564,26 +567,26 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
         metrics = ['mae']
         model.compile(optimizer, loss, metrics=metrics)
 
-      inputs = np.zeros((64, 3), dtype=np.float32)
-      targets = np.zeros((64, 4), dtype=np.float32)
+        inputs = np.zeros((64, 3), dtype=np.float32)
+        targets = np.zeros((64, 4), dtype=np.float32)
 
-      # Call fit with validation data
-      model.fit(inputs, targets, epochs=1, batch_size=2, verbose=0,
-                validation_data=(inputs, targets))
+        # Call fit with validation data
+        model.fit(inputs, targets, epochs=1, batch_size=2, verbose=0,
+                  validation_data=(inputs, targets))
 
-      # TODO(anjalisridhar): We need tests for when the batch size and steps are
-      # smaller and results in a 0 batch_size and steps value.
-      model.evaluate(inputs, targets)
-      # with steps
-      model.evaluate(inputs, targets, steps=2)
-      # with batch_size
-      model.evaluate(inputs, targets, batch_size=8)
+        # TODO(anjalisridhar): We need tests for when the batch size and steps
+        # are smaller and results in a 0 batch_size and steps value.
+        model.evaluate(inputs, targets)
+        # with steps
+        model.evaluate(inputs, targets, steps=2)
+        # with batch_size
+        model.evaluate(inputs, targets, batch_size=8)
 
-      model.predict(inputs)
-      # with steps
-      model.predict(inputs, steps=2)
-      # with batch_size
-      model.predict(inputs, batch_size=8)
+        model.predict(inputs)
+        # with steps
+        model.predict(inputs, steps=2)
+        # with batch_size
+        model.predict(inputs, batch_size=8)
 
   @combinations.generate(strategy_for_numpy_input_combinations())
   def test_calling_model_with_nested_numpy_arrays(self, distribution):
@@ -1149,5 +1152,37 @@ class TestDistributionStrategyWithNormalizationLayer(
       np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
 
 
+class TestDistributionStrategyValidation(test.TestCase,
+                                         parameterized.TestCase):
+
+  @combinations.generate(all_strategy_combinations_minus_default())
+  def test_layer_outside_scope(self, distribution):
+    with self.cached_session():
+      with self.assertRaisesRegexp(
+          ValueError, 'was not created in the distribution strategy'):
+        x = keras.layers.Input(shape=(3,), name='input')
+        y = keras.layers.Dense(4, name='dense')(x)
+        with distribution.scope():
+          model = keras.Model(x, y)
+          optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+          loss = 'mse'
+          metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+          model.compile(optimizer, loss, metrics=metrics)
+
+  @combinations.generate(all_strategy_combinations_minus_default())
+  def test_model_outside_scope(self, distribution):
+    with self.cached_session():
+      with self.assertRaisesRegexp(
+          ValueError, 'was not created in the distribution strategy'):
+        x = keras.layers.Input(shape=(3,), name='input')
+        y = keras.layers.Dense(4, name='dense')(x)
+        model = keras.Model(x, y)
+        with distribution.scope():
+          optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+          loss = 'mse'
+          metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+          model.compile(optimizer, loss, metrics=metrics)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/distribute/python/metrics_v1_test.py b/tensorflow/contrib/distribute/python/metrics_v1_test.py
index 32a0d199434e0627122fd4e47cf8894079ef3a1e..7472f6dde45a75633133b33b181db4d1a7aba287 100644
--- a/tensorflow/contrib/distribute/python/metrics_v1_test.py
+++ b/tensorflow/contrib/distribute/python/metrics_v1_test.py
@@ -122,7 +122,6 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
         batches_per_update = distribution.num_replicas_in_sync
 
       self.evaluate(iterator.initializer)
-      self.evaluate(distribution.initialize())
       self.evaluate(variables.local_variables_initializer())
 
       batches_consumed = 0
@@ -136,8 +135,6 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
         if batches_consumed >= 4:  # Consume 4 input batches in total.
           break
 
-      self.evaluate(distribution.finalize())
-
   @combinations.generate(all_combinations() + tpu_combinations())
   def testMean(self, distribution):
     def _dataset_fn():
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index 824c4b09371fcc8d590f2d2b2be8f39b4a585b27..b0e24a53f6f6a9867150f4b81a0bd3213757e0b3 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -75,7 +75,6 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         return distribution.run_steps_on_dataset(
             step_fn, iterator, iterations=2).run_op
 
-      self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
         with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
@@ -84,12 +83,9 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       weights, biases = [], []
       for _ in range(5):
         run_step()
-
         weights.append(self.evaluate(layer.kernel))
         biases.append(self.evaluate(layer.bias))
 
-      self.evaluate(distribution.finalize())
-
       error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
       is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
       self.assertTrue(is_not_increasing)
@@ -152,7 +148,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
     # `distribution.scope`.
     with variable_scope.variable_creator_scope(
         appending_creator), distribution.scope():
-      model_fn, dataset_fn, layer = minimize_loss_example(
+      model_fn, dataset_fn, _ = minimize_loss_example(
           optimizer_fn,
           use_bias=True,
           use_callable_loss=True,
@@ -169,16 +165,12 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         return distribution.run_steps_on_dataset(
             step_fn, iterator, iterations=1).run_op
 
-      self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
         with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
       self.evaluate(variables_lib.global_variables_initializer())
-
       run_step()
 
-      self.evaluate(distribution.finalize())
-
       def get_expected_variables(optimizer_fn, num_parameter_devices):
         variables_map = {
             "GradientDescent": ["dense/kernel", "dense/bias"],
@@ -241,7 +233,6 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         return distribution.run_steps_on_dataset(
             step_fn, iterator, iterations=1).run_op
 
-      self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
         with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
@@ -267,8 +258,6 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
               expected_moving_mean - averaged_batch_mean(i)) * (1.0 - momentum))
           self.assertNear(expected_moving_means[i], moving_means[i], 0.0001)
 
-      self.evaluate(distribution.finalize())
-
   @combinations.generate(
       combinations.times(
           combinations.combine(
@@ -335,7 +324,6 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         return distribution.run_steps_on_dataset(
             step_fn, iterator, iterations=1).run_op
 
-      self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
         with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
@@ -370,8 +358,6 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         # One of the mean loss reductions.
         self.assertNear(weight, 2 + 10.6, 0.0001)
 
-      self.evaluate(distribution.finalize())
-
   @combinations.generate(
       combinations.times(
           combinations.distributions_and_v1_optimizers(),
@@ -458,7 +444,6 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
             reduced=False, distribution=distribution)
         return (ctx.run_op, ctx.last_step_outputs["replica_loss_reduced"])
 
-      self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
         with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
@@ -471,8 +456,6 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         weights.append(self.evaluate(layer.kernel))
         biases.append(self.evaluate(layer.bias))
 
-      self.evaluate(distribution.finalize())
-
       loss_is_not_increasing = all(y <= x for x, y in zip(losses, losses[1:]))
       self.assertTrue(loss_is_not_increasing)
 
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index 71e50b83b079bc73a7b178356f0f26adbd98638f..2e23a51ee56ed1388a4387a51342aabce6d24bed 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -21,8 +21,8 @@ from __future__ import print_function
 import functools
 
 from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
 from tensorflow.python.distribute import mirrored_strategy
-from tensorflow.python.distribute import values
 
 
 # pylint: disable=protected-access,invalid-name
@@ -48,8 +48,8 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
   distributed environment.
 
   There are several important concepts for distributed TensorFlow, e.g.
-  `client`, `job`, 'task', `cluster`, `in-graph replication` and
-  'synchronous training' and they have already been defined in the
+  `client`, `job`, `task`, `cluster`, `in-graph replication` and
+  `synchronous training` and they have already been defined in the
   [TensorFlow's documentation](https://www.tensorflow.org/deploy/distributed).
   The distribution strategy inherits these concepts as well and in addition to
   that we also clarify several more concepts:
@@ -104,6 +104,61 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
                                 auto_shard_dataset)
     super(MirroredStrategy, self).__init__(extended)
 
+  # Override to change the documentation to reflect the different handling of
+  # global vs. local batch size between core and contrib.
+  def make_dataset_iterator(self, dataset):  # pylint: disable=useless-super-delegation
+    """Makes an iterator for input provided via `dataset`.
+
+    NOTE: The batch size of the `dataset` argument is treated differently for
+    this contrib version of `MirroredStrategy`.
+
+    Data from the given dataset will be distributed evenly across all the
+    compute replicas. We will assume that the input dataset is batched by the
+    per-replica batch size.
+
+    The user could also use `make_input_fn_iterator` if they want to
+    customize which input is fed to which replica/worker etc.
+
+    Args:
+      dataset: `tf.data.Dataset` that will be distributed evenly across all
+        replicas.
+
+    Returns:
+      An `tf.distribute.InputIterator` which returns inputs for each step of the
+      computation.  User should call `initialize` on the returned iterator.
+    """
+    return super(MirroredStrategy, self).make_dataset_iterator(dataset)
+
+  # Override to change the documentation to reflect the different handling of
+  # global vs. local batch size between core and contrib.
+  def experimental_make_numpy_iterator(  # pylint: disable=useless-super-delegation
+      self, numpy_input, batch_size, num_epochs=1, shuffle=1024, session=None):
+    """Makes an iterator for input provided via a nest of numpy arrays.
+
+    NOTE: The `batch_size` argument here has different behavior for this
+    contrib version of `MirroredStrategy`.
+
+    Args:
+      numpy_input: A nest of NumPy input arrays that will be distributed evenly
+        across all replicas.
+      batch_size: The number of entries from the array we should consume in one
+        step of the computation, across all replicas. This is the per-replica
+        batch size. The global batch size will be this times
+        `num_replicas_in_sync`.
+      num_epochs: The number of times to iterate through the examples. A value
+        of `None` means repeat forever.
+      shuffle: Size of buffer to use for shuffling the input examples.
+        Use `None` to disable shuffling.
+      session: (TensorFlow v1.x graph execution only) A session used for
+        initialization.
+
+    Returns:
+      An `tf.distribute.InputIterator` which returns inputs for each step of the
+      computation.  User should call `initialize` on the returned iterator.
+    """
+    return super(MirroredStrategy, self).experimental_make_numpy_iterator(
+        numpy_input, batch_size, num_epochs, shuffle, session)
+
 
 class MirroredExtended(CoreMirroredExtended):
   """Implementation of (contrib) MirroredStrategy."""
@@ -135,14 +190,14 @@ class MirroredExtended(CoreMirroredExtended):
     Returns:
       An `InputIterator` which returns inputs for each step of the computation.
     """
-    return values.DatasetIterator(dataset, self._input_workers)
+    return input_lib.DatasetIterator(dataset, self._input_workers)
 
   def _distribute_dataset(self, dataset_fn):
     if self._local_mode:
-      return values.PerReplicaDataset(
+      return input_lib.PerReplicaDataset(
           self._call_dataset_fn(dataset_fn), self._input_workers, 0)
     else:
-      return values.MultiWorkerDataset(
+      return input_lib.MultiWorkerDataset(
           functools.partial(self._call_dataset_fn, dataset_fn),
           self._input_workers,
           auto_shard=self._auto_shard_dataset)
@@ -150,4 +205,5 @@ class MirroredExtended(CoreMirroredExtended):
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
   def _global_batch_size(self):
+    """The contrib version of Mirrored strategy uses per-replica batch size."""
     return False
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index 9cbc34412da2d498fa1f0624f5cba466f663c194..59d711ae0197716f1af31c24c5748bce543758dc 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -66,8 +66,10 @@ GPU_TEST = "test_gpu" in sys.argv[0]
         combinations.core_mirrored_strategy_with_gpu_and_cpu,
         combinations.core_mirrored_strategy_with_two_gpus],
     mode=["graph", "eager"]))
-class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase,
-                                        parameterized.TestCase):
+class MirroredTwoDeviceDistributionTest(
+    strategy_test_lib.DistributionTestBase,
+    strategy_test_lib.TwoDeviceDistributionTestBase,
+    parameterized.TestCase):
 
   def testMinimizeLoss(self, distribution):
     if context.executing_eagerly():
@@ -114,9 +116,30 @@ class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase,
     self._test_input_fn_iterator(iterator, distribution.extended.worker_devices,
                                  expected_values)
 
+  def testNumpyIterator(self, distribution):
+    self._test_numpy_iterator(distribution)
+
   def testGlobalStepUpdate(self, distribution):
     self._test_global_step_update(distribution)
 
+  def testAllReduceSum(self, distribution):
+    self._test_all_reduce_sum(distribution)
+
+  def testAllReduceSumGradients(self, distribution):
+    self._test_all_reduce_sum_gradients(distribution)
+
+  def testAllReduceSumGradientTape(self, distribution):
+    self._test_all_reduce_sum_gradient_tape(distribution)
+
+  def testAllReduceMean(self, distribution):
+    self._test_all_reduce_mean(distribution)
+
+  def testAllReduceMeanGradients(self, distribution):
+    self._test_all_reduce_mean_gradients(distribution)
+
+  def testAllReduceMeanGradientTape(self, distribution):
+    self._test_all_reduce_mean_gradient_tape(distribution)
+
 
 def one_device_combinations():
   return combinations.combine(
@@ -128,25 +151,42 @@ def one_device_combinations():
       mode=["graph", "eager"])
 
 
+@combinations.generate(one_device_combinations())
 class MirroredOneDeviceDistributionTest(
     strategy_test_lib.DistributionTestBase,
+    strategy_test_lib.OneDeviceDistributionTestBase,
     parameterized.TestCase):
 
-  @combinations.generate(one_device_combinations())
   def testMinimizeLoss(self, distribution):
     if context.executing_eagerly():
       self._test_minimize_loss_eager(distribution)
     else:
       self._test_minimize_loss_graph(distribution)
 
-  @combinations.generate(one_device_combinations())
   def testReplicaId(self, distribution):
     self._test_replica_id(distribution)
 
-  @combinations.generate(one_device_combinations())
   def testCallAndMergeExceptions(self, distribution):
     self._test_call_and_merge_exceptions(distribution)
 
+  def testAllReduceSum(self, distribution):
+    self._test_all_reduce_sum(distribution)
+
+  def testAllReduceSumGradients(self, distribution):
+    self._test_all_reduce_sum_gradients(distribution)
+
+  def testAllReduceSumGradientTape(self, distribution):
+    self._test_all_reduce_sum_gradient_tape(distribution)
+
+  def testAllReduceMean(self, distribution):
+    self._test_all_reduce_mean(distribution)
+
+  def testAllReduceMeanGradients(self, distribution):
+    self._test_all_reduce_mean_gradients(distribution)
+
+  def testAllReduceMeanGradientTape(self, distribution):
+    self._test_all_reduce_mean_gradient_tape(distribution)
+
 
 class MirroredStrategyVariableCreatorStackTest(
     test.TestCase, parameterized.TestCase):
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
index 700751d68c5a517876bf41d09223e4c1a40b50c8..836cb7cc41b62352fd69a4a209d483ccf0fc498e 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import numpy_dataset
 from tensorflow.python.distribute import values
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -49,10 +51,11 @@ class OneDeviceExtended(distribute_lib.DistributionStrategyExtended):
     super(OneDeviceExtended, self).__init__(container_strategy)
     self._device = device
     self._default_device = device
-    worker = device_util.canonicalize("/device:CPU:0")
-    worker_device_pairs = [(worker, [self._device])]
+    self._input_device = device_util.canonicalize("/device:CPU:0")
+    worker_device_pairs = [(self._input_device, [self._device])]
     device_map = values.SingleDeviceMap(device)
-    self._input_workers = values.InputWorkers(device_map, worker_device_pairs)
+    self._input_workers = input_lib.InputWorkers(
+        device_map, worker_device_pairs)
 
   def _create_variable(self, next_creator, *args, **kwargs):
     colocate_with = kwargs.pop("colocate_with", None)
@@ -67,19 +70,23 @@ class OneDeviceExtended(distribute_lib.DistributionStrategyExtended):
 
   def _make_dataset_iterator(self, dataset):
     """Make iterator from dataset without splitting the batch."""
-    return values.DatasetIterator(dataset, self._input_workers)
+    return input_lib.DatasetIterator(dataset, self._input_workers)
 
   def _distribute_dataset(self, dataset_fn):
-    return values.PerReplicaDataset(
+    return input_lib.PerReplicaDataset(
         self._call_dataset_fn(dataset_fn), self._input_workers, 0)
 
   def _make_input_fn_iterator(
       self,
       input_fn,
       replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
-    return values.InputFunctionIterator(
+    return input_lib.InputFunctionIterator(
         input_fn, self._input_workers, [distribute_lib.InputContext()])
 
+  def _experimental_make_numpy_dataset(self, numpy_input, session):
+    return numpy_dataset.one_host_numpy_dataset(
+        numpy_input, self._input_device, session)
+
   def _broadcast_to(self, tensor, destinations):
     del destinations
     return tensor
@@ -91,7 +98,7 @@ class OneDeviceExtended(distribute_lib.DistributionStrategyExtended):
       initial_loop_values = {}
     initial_loop_values = nest.flatten(initial_loop_values)
 
-    ctx = values.MultiStepContext()
+    ctx = input_lib.MultiStepContext()
     def body(i, *args):
       """A wrapper around `fn` to create the while loop body."""
       del args
@@ -192,6 +199,7 @@ class OneDeviceExtended(distribute_lib.DistributionStrategyExtended):
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
   def _global_batch_size(self):
+    """Global and per-replica batching are equivalent for OneDeviceStrategy."""
     return True
 
 
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy_test.py b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
index d46cd6f529e363f76bfa2b22339add63530cfde8..f81466a6c75f1cf287cdb00917872f77383c615e 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
@@ -25,7 +25,9 @@ from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 
 
-class OneDeviceStrategyTest(strategy_test_lib.DistributionTestBase):
+class OneDeviceStrategyTest(
+    strategy_test_lib.DistributionTestBase,
+    strategy_test_lib.OneDeviceDistributionTestBase):
 
   def _get_distribution_strategy(self):
     return one_device_strategy.OneDeviceStrategy("/device:CPU:0")
@@ -57,6 +59,28 @@ class OneDeviceStrategyTest(strategy_test_lib.DistributionTestBase):
     self._test_input_fn_iterator(
         iterator, d.extended.worker_devices, expected_values)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testNumpyIterator(self):
+    self._test_numpy_iterator(self._get_distribution_strategy())
+
+  def testAllReduceSum(self):
+    self._test_all_reduce_sum(self._get_distribution_strategy())
+
+  def testAllReduceSumGradients(self):
+    self._test_all_reduce_sum_gradients(self._get_distribution_strategy())
+
+  def testAllReduceSumGradientTape(self):
+    self._test_all_reduce_sum_gradient_tape(self._get_distribution_strategy())
+
+  def testAllReduceMean(self):
+    self._test_all_reduce_mean(self._get_distribution_strategy())
+
+  def testAllReduceMeanGradients(self):
+    self._test_all_reduce_mean_gradients(self._get_distribution_strategy())
+
+  def testAllReduceMeanGradientTape(self):
+    self._test_all_reduce_mean_gradient_tape(self._get_distribution_strategy())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
index a6e924b509fc15c04e15c6a5caeb6caed638395d..0cefef7545f3be1f8116fea3ffdb0e91888159de 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
@@ -18,34 +18,24 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import copy
-
-from tensorflow.contrib.distribute.python import mirrored_strategy
-from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
-from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
-from tensorflow.python.distribute import multi_worker_util
-from tensorflow.python.distribute import values
-from tensorflow.python.eager import context
-from tensorflow.python.framework import device as tf_device
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import device_setter
-from tensorflow.python.util import nest
+from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import parameter_server_strategy
+from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
+from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
+
+# pylint: disable=protected-access,invalid-name,line-too-long
+CoreParameterServerStrategy = parameter_server_strategy.ParameterServerStrategy
+CoreParameterServerExtended = parameter_server_strategy.ParameterServerStrategyExtended
 
-_LOCAL_CPU = "/device:CPU:0"
-_LOCAL_GPU_0 = "/device:GPU:0"
+# pylint: enable=protected-access,invalid-name,line-too-long
 
 
-# TODO(yuefengz): maybe cache variables on local CPU.
-# TODO(yuefengz): we may want to set session options to disallow communication
-# between workers.
 class ParameterServerStrategy(distribute_lib.DistributionStrategy):
   """A parameter server DistributionStrategy.
 
+  *** contrib version ***
+
   This strategy class works for both local training and between-graph replicated
   training for multiple workers. If `cluster_spec` is specified, either passed
   in to __init__() method or parsed from the
@@ -99,437 +89,84 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
     super(ParameterServerStrategy, self).__init__(
         ParameterServerExtended(self, num_gpus_per_worker))
 
+  # Override to change the documentation to reflect the different handling of
+  # global vs. local batch size between core and contrib.
+  def make_dataset_iterator(self, dataset):  # pylint: disable=useless-super-delegation
+    """Makes an iterator for input provided via `dataset`.
 
-class ParameterServerExtended(distribute_lib.DistributionStrategyExtended):
-  """Implementation of ParameterServerStrategy."""
+    NOTE: The batch size of the `dataset` argument is treated differently for
+    this contrib version of `ParameterServerStrategy`.
 
-  def __init__(self, container_strategy, num_gpus_per_worker):
-    super(ParameterServerExtended, self).__init__(container_strategy)
-    self._num_gpus_per_worker = num_gpus_per_worker
-    self._initialize_local(num_gpus_per_worker)
+    Data from the given dataset will be distributed evenly across all the
+    compute replicas. We will assume that the input dataset is batched by the
+    per-replica batch size.
 
-    # We typically don't need to do all-reduce in this strategy.
-    self._cross_device_ops = (
-        cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
-            reduce_to_device=_LOCAL_CPU))
-
-  def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
-                               task_type, task_id):
-    """Initialize devices for multiple workers.
-
-    It creates variable devices and compute devices. Variables and operations
-    will be assigned to them respectively. We have one compute device per
-    replica. The variable device is a device function or device string. The
-    default variable device assigns variables to parameter servers in a
-    round-robin fashion.
+    The user could also use `make_input_fn_iterator` if they want to
+    customize which input is fed to which replica/worker etc.
 
     Args:
-      num_gpus_per_worker: number of local GPUs or GPUs per worker.
-      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
-        cluster configurations.
-      task_type: the current task type.
-      task_id: the current task id.
+      dataset: `tf.data.Dataset` that will be distributed evenly across all
+        replicas.
 
-    Raises:
-      ValueError: if the cluster_spec doesn't have ps jobs.
+    Returns:
+      An `tf.distribute.InputIterator` which returns inputs for each step of the
+      computation.  User should call `initialize` on the returned iterator.
     """
-    assert cluster_spec
-    if not task_type or task_id is None:
-      raise ValueError("When `cluster_spec` is given, you must also specify "
-                       "`task_type` and `task_id`")
-    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
-
-    worker_device = "/job:%s/task:%d" % (self._task_type, self._task_id)
-
-    # Define compute devices which is a list of device strings and one for each
-    # replica. When there are GPUs, replicate operations on these GPUs.
-    # Otherwise, place operations on CPU.
-    if num_gpus_per_worker > 0:
-      compute_devices = tuple(
-          "%s/device:GPU:%d" % (worker_device, i)
-          for i in range(num_gpus_per_worker)
-      )
-    else:
-      compute_devices = (worker_device,)
-
-    self._device_map = values.ReplicaDeviceMap(compute_devices)
-    self._input_workers = values.InputWorkers(
-        self._device_map, [(worker_device, compute_devices)])
-
-    # In distributed mode, place variables on ps jobs in a round-robin fashion.
-    # Note that devices returned from `replica_device_setter` are not
-    # canonical and therefore we don't canonicalize all variable devices to
-    # make them consistent.
-    # TODO(yuefengz): support passing a strategy object to control variable
-    # assignment.
-    # TODO(yuefengz): merge the logic of replica_device_setter into this
-    # class.
-    num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
-    if num_ps_replicas == 0:
-      raise ValueError("The cluster spec needs to have `ps` jobs.")
-    self._variable_device = device_setter.replica_device_setter(
-        ps_tasks=num_ps_replicas,
-        worker_device=worker_device,
-        merge_devices=True,
-        cluster=cluster_spec)
-
-    # The `_parameter_devices` is needed for the `parameter_devices` property
-    # and is a list of all variable devices. Here parameter devices are all
-    # tasks of the "ps" job.
-    self._parameter_devices = tuple(map("/job:ps/task:{}".format,
-                                        range(num_ps_replicas)))
-
-    # Add a default device so that ops without specified devices will not end up
-    # on other workers.
-    self._default_device = worker_device
-
-    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
-                                                task_id)
-    self._cluster_spec = cluster_spec
-    self._task_type = task_type
-    self._task_id = task_id
-
-    logging.info(
-        "Multi-worker ParameterServerStrategy with "
-        "cluster_spec = %r, task_type = %r, task_id = %r, "
-        "num_ps_replicas = %r, is_chief = %r, device_map = %r, "
-        "variable_device = %r", cluster_spec.as_dict(), task_type, task_id,
-        num_ps_replicas, self._is_chief, self._device_map,
-        self._variable_device)
-
-  def _initialize_local(self, num_gpus_per_worker):
-    """Initialize internal devices for local training."""
-    worker_device = device_util.canonicalize("/device:CPU:0")
-    # Define compute devices which is a list of device strings and one for each
-    # replica. When there are GPUs, replicate operations on these GPUs.
-    # Otherwise, place operations on CPU.
-    if num_gpus_per_worker > 0:
-      compute_devices = tuple(
-          map("/device:GPU:{}".format, range(num_gpus_per_worker)))
-    else:
-      compute_devices = (_LOCAL_CPU,)
-
-    self._device_map = values.ReplicaDeviceMap(compute_devices)
-    self._input_workers = values.InputWorkers(
-        self._device_map, [(worker_device, compute_devices)])
-
-    # If there is only one GPU, put everything on that GPU. Otherwise, place
-    # variables on CPU.
-    if num_gpus_per_worker == 1:
-      assert len(compute_devices) == 1
-      self._variable_device = _LOCAL_GPU_0
-      self._parameter_devices = (_LOCAL_GPU_0,)
-    else:
-      self._variable_device = _LOCAL_CPU
-      self._parameter_devices = (_LOCAL_CPU,)
-
-    self._is_chief = True
-    self._cluster_spec = None
-    self._task_type = None
-    self._task_id = None
-
-    logging.info(
-        "ParameterServerStrategy with compute_devices = %r, "
-        "variable_device = %r", compute_devices, self._variable_device)
-
-  def _validate_colocate_with_variable(self, colocate_with_variable):
-    values.validate_colocate(colocate_with_variable, self)
-
-  def _distribute_dataset(self, dataset_fn):
-    """Distributes the dataset to each local GPU."""
-    return values.PerReplicaDataset(
-        self._call_dataset_fn(dataset_fn), self._input_workers, 0,
-        prefetch_on_device=True)
-
-  def _make_dataset_iterator(self, dataset):
-    return values.DatasetIterator(dataset, self._input_workers,
-                                  self._num_replicas_in_sync)
-
-  def _make_input_fn_iterator(
-      self,
-      input_fn,
-      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
-    """Distributes the dataset to each local GPU."""
-    if self._cluster_spec:
-      input_pipeline_id = multi_worker_util.id_in_cluster(
-          self._cluster_spec, self._task_type, self._task_id)
-      num_input_pipelines = multi_worker_util.worker_count(
-          self._cluster_spec, self._task_type)
-    else:
-      input_pipeline_id = 0
-      num_input_pipelines = 1
-    input_context = distribute_lib.InputContext(
-        num_input_pipelines=num_input_pipelines,
-        input_pipeline_id=input_pipeline_id,
-        num_replicas_in_sync=self._num_replicas_in_sync)
-    return values.InputFunctionIterator(
-        input_fn, self._input_workers, [input_context])
-
-  def _broadcast_to(self, tensor, destinations):
-    # This is both a fast path for Python constants, and a way to delay
-    # converting Python values to a tensor until we know what type it
-    # should be converted to. Otherwise we have trouble with:
-    #   global_step.assign_add(1)
-    # since the `1` gets broadcast as an int32 but global_step is int64.
-    if isinstance(tensor, (float, int)):
-      return tensor
-    if not cross_device_ops_lib.check_destinations(destinations):
-      # TODO(josh11b): Use current logical device instead of 0 here.
-      destinations = values.LogicalDeviceSpec(
-          device_map=self._device_map, logical_device=0)
-    return self._cross_device_ops.broadcast(tensor, destinations)
-
-  def _allow_variable_partition(self):
-    return not context.executing_eagerly()
-
-  # TODO(yuefengz): not all ops in device_setter.STANDARD_PS_OPS will go through
-  # this creator, such as "MutableHashTable".
-  def _create_variable(self, next_creator, *args, **kwargs):
-    if self._num_replicas_in_sync > 1:
-      aggregation = kwargs.pop("aggregation", vs.VariableAggregation.NONE)
-      if aggregation not in (
-          vs.VariableAggregation.NONE,
-          vs.VariableAggregation.SUM,
-          vs.VariableAggregation.MEAN,
-          vs.VariableAggregation.ONLY_FIRST_REPLICA
-      ):
-        raise ValueError("Invalid variable aggregation mode: " + aggregation +
-                         " for variable: " + kwargs["name"])
-
-      def var_creator(*args, **kwargs):
-        """Create an AggregatingVariable and fix up collections."""
-        # Record what collections this variable should be added to.
-        collections = kwargs.pop("collections", None)
-        if collections is None:
-          collections = [ops.GraphKeys.GLOBAL_VARIABLES]
-        kwargs["collections"] = []
-
-        # Create and wrap the variable.
-        v = next_creator(*args, **kwargs)
-        wrapped = values.AggregatingVariable(
-            self._container_strategy(), v, aggregation)
-
-        # Add the wrapped variable to the requested collections.
-        # The handling of eager mode and the global step matches
-        # ResourceVariable._init_from_args().
-        if not context.executing_eagerly():
-          g = ops.get_default_graph()
-          # If "trainable" is True, next_creator() will add the contained
-          # variable to the TRAINABLE_VARIABLES collection, so we manually
-          # remove it and replace with the wrapper. We can't set "trainable"
-          # to False for next_creator() since that causes functions like
-          # implicit_gradients to skip those variables.
-          if kwargs.get("trainable", True):
-            collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
-            l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
-            l.remove(v)
-          g.add_to_collections(collections, wrapped)
-        elif ops.GraphKeys.GLOBAL_STEP in collections:
-          ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, wrapped)
-
-        return wrapped
-    else:
-      var_creator = next_creator
-
-    if "colocate_with" in kwargs:
-      with ops.device(None):
-        with ops.colocate_with(kwargs["colocate_with"]):
-          return var_creator(*args, **kwargs)
-
-    with ops.colocate_with(None, ignore_existing=True):
-      with ops.device(self._variable_device):
-        return var_creator(*args, **kwargs)
-
-  def _call_for_each_replica(self, fn, args, kwargs):
-    # pylint: disable=protected-access
-    return mirrored_strategy._call_for_each_replica(
-        self._container_strategy(), self._device_map, fn, args, kwargs)
+    return super(ParameterServerStrategy, self).make_dataset_iterator(dataset)
 
-  def _verify_destinations_not_different_worker(self, destinations):
-    if not self._cluster_spec:
-      return
-    if destinations is None:
-      return
-    for d in cross_device_ops_lib.get_devices_from(destinations):
-      d_spec = tf_device.DeviceSpec.from_string(d)
-      if d_spec.job == self._task_type and d_spec.task != self._task_id:
-        raise ValueError(
-            "Cannot reduce to another worker: %r, current worker is %r" %
-            (d, self._input_workers.worker_devices[0]))
+  # Override to change the documentation to reflect the different handling of
+  # global vs. local batch size between core and contrib.
+  def experimental_make_numpy_iterator(  # pylint: disable=useless-super-delegation
+      self, numpy_input, batch_size, num_epochs=1, shuffle=1024, session=None):
+    """Makes an iterator for input provided via a nest of numpy arrays.
 
-  def _reduce_to(self, reduce_op, value, destinations):
-    self._verify_destinations_not_different_worker(destinations)
-    if not isinstance(value, values.DistributedValues):
-      # pylint: disable=protected-access
-      return cross_device_ops_lib.reduce_non_distributed_value(
-          reduce_op, self._device_map, value, destinations)
-    return self._cross_device_ops.reduce(
-        reduce_op, value, destinations=destinations)
-
-  def _batch_reduce_to(self, reduce_op, value_destination_pairs):
-    for _, destinations in value_destination_pairs:
-      self._verify_destinations_not_different_worker(destinations)
-    return self._cross_device_ops.batch_reduce(reduce_op,
-                                               value_destination_pairs)
-
-  def _select_single_value(self, structured):
-    """Select any single values in `structured`."""
-
-    def _select_fn(x):  # pylint: disable=g-missing-docstring
-      if isinstance(x, values.Mirrored):
-        if len(x.devices) == 1:
-          return x.primary
-        else:
-          raise ValueError(
-              "You cannot update variable with a Mirrored object with multiple "
-              "components %r when using ParameterServerStrategy. You must "
-              "specify a single value or a Mirrored with a single value." % x)
-      elif isinstance(x, values.PerReplica):
-        raise ValueError(
-            "You cannot update variable with a PerReplica object %r when using "
-            "ParameterServerStrategy. You must specify a single value or a "
-            "Mirrored with a single value" % x)
-      else:
-        return x
-
-    return nest.map_structure(_select_fn, structured)
-
-  def _update(self, var, fn, args, kwargs, group):
-    if isinstance(var, values.AggregatingVariable):
-      var = var.get()
-    if not isinstance(var, resource_variable_ops.ResourceVariable):
-      raise ValueError(
-          "You can not update `var` %r. It must be a Variable." % var)
-    with ops.colocate_with(var), distribute_lib.UpdateContext(var.device):
-      result = fn(var, *self._select_single_value(args),
-                  **self._select_single_value(kwargs))
-      if group:
-        return result
-      else:
-        return nest.map_structure(self._unwrap, result)
-
-  # TODO(yuefengz): does it need to call _select_single_value?
-  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
-    with ops.device(
-        colocate_with.device), distribute_lib.UpdateContext(colocate_with):
-      result = fn(*args, **kwargs)
-      if group:
-        return result
-      else:
-        return nest.map_structure(self._unwrap, result)
-
-  def _unwrap(self, val):
-    if isinstance(val, values.DistributedValues):
-      return val.values
-    return (val,)
-
-  def value_container(self, val):
-    if (hasattr(val, "_aggregating_container") and
-        not isinstance(val, values.AggregatingVariable)):
-      wrapper = val._aggregating_container()  # pylint: disable=protected-access
-      if wrapper is not None:
-        return wrapper
-    return val
-
-  def read_var(self, var):
-    # No need to distinguish between normal variables and replica-local
-    # variables.
-    return array_ops.identity(var)
-
-  def _configure(self,
-                 session_config=None,
-                 cluster_spec=None,
-                 task_type=None,
-                 task_id=None):
-    """Configures the strategy class.
-
-    The strategy object will be re-initialized if `cluster_spec` is given but
-    was not passed in the constructor.
+    NOTE: The `batch_size` argument here has different behavior for this
+    contrib version of `ParameterServerStrategy`.
 
     Args:
-      session_config: not used currently.
-      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
-        cluster configurations.
-      task_type: the current task type.
-      task_id: the current task id.
-
-    Raises:
-      ValueError: if `cluster_spec` is given but `task_type` or `task_id` is
-        not.
+      numpy_input: A nest of NumPy input arrays that will be distributed evenly
+        across all replicas.
+      batch_size: The number of entries from the array we should consume in one
+        step of the computation, across all replicas. This is the per-replica
+        batch size. The global batch size will be this times
+        `num_replicas_in_sync`.
+      num_epochs: The number of times to iterate through the examples. A value
+        of `None` means repeat forever.
+      shuffle: Size of buffer to use for shuffling the input examples.
+        Use `None` to disable shuffling.
+      session: (TensorFlow v1.x graph execution only) A session used for
+        initialization.
+
+    Returns:
+      An `tf.distribute.InputIterator` which returns inputs for each step of the
+      computation.  User should call `initialize` on the returned iterator.
     """
-    if not self._cluster_spec and cluster_spec:
-      # If a `cluster_spec` is already passed in, do nothing here.
-      # TODO(yuefengz): check `cluster_spec` is the same if this object has
-      # already been initialized with a `cluster_spec`.
-      if task_type is None or task_id is None:
-        raise ValueError("When `cluster_spec` is given, must also specify "
-                         "`task_type` and `task_id`.")
-      self._cluster_spec = multi_worker_util.normalize_cluster_spec(
-          cluster_spec)
-      self._task_type = task_type
-      self._task_id = task_id
-      self._initialize_multi_worker(self._num_gpus_per_worker,
-                                    self._cluster_spec, task_type, task_id)
-
-    if session_config:
-      session_config.CopyFrom(self._update_config_proto(session_config))
-
-  def _update_config_proto(self, config_proto):
-    updated_config = copy.deepcopy(config_proto)
-    if not self._cluster_spec:
-      updated_config.isolate_session_state = True
-      return updated_config
-
-    updated_config.isolate_session_state = False
+    return super(ParameterServerStrategy,
+                 self).experimental_make_numpy_iterator(
+                     numpy_input, batch_size, num_epochs, shuffle, session)
 
-    assert self._task_type
-    assert self._task_id is not None
 
-    # The device filters prevent communication between workers.
-    if self._task_type not in ["chief", "worker"]:
-      return updated_config
-    del updated_config.device_filters[:]
-    updated_config.device_filters.extend(
-        ["/job:%s/task:%d" % (self._task_type, self._task_id), "/job:ps"])
-    return updated_config
-
-  @property
-  def _num_replicas_in_sync(self):
-    return self._device_map.num_replicas_in_graph
-
-  @property
-  def worker_devices(self):
-    return self._device_map.all_devices
-
-  @property
-  def worker_devices_by_replica(self):
-    return self._device_map.devices_by_replica
-
-  @property
-  def parameter_devices(self):
-    return self._parameter_devices
-
-  def non_slot_devices(self, var_list):
-    return min(var_list, key=lambda x: x.name)
-
-  @property
-  def experimental_between_graph(self):
-    # TODO(yuefengz): Should this return False in the local case?
-    return True
-
-  @property
-  def experimental_should_init(self):
-    return self._is_chief
+class ParameterServerExtended(CoreParameterServerExtended):
+  """Implementation of ParameterServerStrategy."""
 
-  @property
-  def should_checkpoint(self):
-    return self._is_chief
+  def __init__(self, container_strategy, num_gpus_per_worker):
+    # Use TFConfigClusterResolver to parse TF_CONFIG. We don't want to change
+    # the constructor's interface to allow customized cluster resolver. Use
+    # SimpleClusterResolver to override num_accelerators.
+    tfconfig = TFConfigClusterResolver()
+    cluster_resolver = SimpleClusterResolver(
+        cluster_spec=tfconfig.cluster_spec(),
+        task_type=tfconfig.task_type,
+        task_index=tfconfig.task_index,
+        num_accelerators=num_gpus_per_worker)
+    super(ParameterServerExtended, self).__init__(
+        container_strategy, cluster_resolver=cluster_resolver)
 
-  @property
-  def should_save_summary(self):
-    return self._is_chief
+  def _make_dataset_iterator(self, dataset):
+    return input_lib.DatasetIterator(dataset, self._input_workers)
 
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
   def _global_batch_size(self):
+    """The contrib version of PS strategy uses per-replica batch size."""
     return False
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
index ce7065f220541ce2437f4768c312365a35197ab4..802809e7c7ec037ca3238c3e615308c701368d2a 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
@@ -29,10 +29,13 @@ from tensorflow.contrib.distribute.python import strategy_test_lib
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import distribution_strategy_context as ds_context
 from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import parameter_server_strategy as core_parameter_server_strategy
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.distribute import values
+from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import run_config
@@ -50,6 +53,7 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import training_util
+from tensorflow.python.training.server_lib import ClusterSpec
 
 CHIEF = run_config.TaskType.CHIEF
 WORKER = run_config.TaskType.WORKER
@@ -63,6 +67,57 @@ def _get_replica_id_integer():
   return replica_id
 
 
+class MockCoreParameterServerStrategy(distribute_lib.DistributionStrategy):
+  """Mock the strategy to allow cluster resolver as an argument."""
+
+  def __init__(self, cluster_resolver):
+    super(MockCoreParameterServerStrategy, self).__init__(
+        core_parameter_server_strategy.ParameterServerStrategyExtended(
+            self, cluster_resolver=cluster_resolver))
+
+
+def create_test_objects(cluster_spec=None,
+                        task_type=None,
+                        task_id=None,
+                        num_gpus=None,
+                        sess_config=None,
+                        use_core_strategy=False):
+  sess_config = sess_config or config_pb2.ConfigProto()
+  if num_gpus is None:
+    num_gpus = context.num_gpus()
+  if use_core_strategy:
+    if cluster_spec and task_type and task_id is not None:
+      cluster_resolver = SimpleClusterResolver(
+          cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
+          task_type=task_type,
+          task_index=task_id,
+          num_accelerators=num_gpus)
+      target = 'grpc://' + cluster_spec[WORKER][task_id]
+    else:
+      cluster_resolver = SimpleClusterResolver(
+          ClusterSpec({}), num_accelerators=num_gpus)
+      target = ''
+
+    distribution = MockCoreParameterServerStrategy(cluster_resolver)
+    sess_config = copy.deepcopy(sess_config)
+    sess_config = distribution.update_config_proto(sess_config)
+  else:
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=num_gpus)
+    if task_type:
+      sess_config = copy.deepcopy(sess_config)
+      distribution.configure(
+          session_config=sess_config,
+          cluster_spec=cluster_spec,
+          task_type=task_type,
+          task_id=task_id)
+      target = 'grpc://' + cluster_spec[WORKER][task_id]
+    else:
+      target = ''
+
+  return distribution, target, sess_config
+
+
 class ParameterServerStrategyTestBase(
     multi_worker_test_base.MultiWorkerTestBase):
 
@@ -76,24 +131,27 @@ class ParameterServerStrategyTestBase(
     self._sess_config = config_pb2.ConfigProto(allow_soft_placement=True)
     super(ParameterServerStrategyTestBase, self).setUp()
 
-  def _get_test_objects(self, task_type, task_id, num_gpus):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=num_gpus)
-    if not task_type:
-      return distribution, '', self._sess_config
-
-    sess_config = copy.deepcopy(self._sess_config)
-    distribution.configure(
-        session_config=sess_config,
+  def _get_test_objects(self,
+                        task_type,
+                        task_id,
+                        num_gpus,
+                        use_core_strategy=False):
+    return create_test_objects(
         cluster_spec=self._cluster_spec,
         task_type=task_type,
-        task_id=task_id)
-    return (distribution, 'grpc://' + self._cluster_spec[WORKER][task_id],
-            sess_config)
-
-  def _test_device_assignment_distributed(self, task_type, task_id, num_gpus):
+        task_id=task_id,
+        num_gpus=num_gpus,
+        sess_config=self._sess_config,
+        use_core_strategy=use_core_strategy)
+
+  def _test_device_assignment_distributed(self,
+                                          task_type,
+                                          task_id,
+                                          num_gpus,
+                                          use_core_strategy=False):
     worker_device = '/job:%s/replica:0/task:%d' % (task_type, task_id)
-    d, _, sess_config = self._get_test_objects(task_type, task_id, num_gpus)
+    d, _, sess_config = self._get_test_objects(
+        task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
     with ops.Graph().as_default(), \
          self.cached_session(target=self._default_target,
                              config=sess_config) as sess, \
@@ -191,8 +249,9 @@ class ParameterServerStrategyTestBase(
         self.assertEqual(f_val, 46.0)
 
   def _test_device_assignment_distributed_enable_partitioner(
-      self, task_type, task_id, num_gpus):
-    d, _, sess_config = self._get_test_objects(task_type, task_id, num_gpus)
+      self, task_type, task_id, num_gpus, use_core_strategy=False):
+    d, _, sess_config = self._get_test_objects(
+        task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
     num_shards = len(d.parameter_devices)
     partitioner = partitioned_variables.fixed_size_partitioner(num_shards)
     with ops.Graph().as_default(), \
@@ -340,9 +399,13 @@ class ParameterServerStrategyTestBase(
         self.assertEqual(z_val, 43.0)
         self.assertEqual(f_val, 46.0)
 
-  def _test_simple_increment(self, task_type, task_id, num_gpus):
+  def _test_simple_increment(self,
+                             task_type,
+                             task_id,
+                             num_gpus,
+                             use_core_strategy=False):
     d, master_target, sess_config = self._get_test_objects(
-        task_type, task_id, num_gpus)
+        task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
     if d.extended._cluster_spec:
       num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER))
       if 'chief' in d.extended._cluster_spec.as_dict():
@@ -410,9 +473,13 @@ class ParameterServerStrategyTestBase(
               y_val == 20.0 + 1.0 * num_workers * d.num_replicas_in_sync and
               z_val == 30.0 + 1.0 * num_workers)
 
-  def _test_minimize_loss_graph(self, task_type, task_id, num_gpus):
+  def _test_minimize_loss_graph(self,
+                                task_type,
+                                task_id,
+                                num_gpus,
+                                use_core_strategy=False):
     d, master_target, sess_config = self._get_test_objects(
-        task_type, task_id, num_gpus)
+        task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
     if task_type:
       # Multi-worker
       assert hasattr(d.extended, '_cluster_spec') and d.extended._cluster_spec
@@ -498,10 +565,15 @@ class ParameterServerStrategyTestBase(
       self.assertLess(error_after, error_before)
       return error_after < error_before
 
-  def _test_input_fn_iterator(self, task_type, task_id, num_gpus, input_fn,
-                              expected_values):
+  def _test_input_fn_iterator(self,
+                              task_type,
+                              task_id,
+                              num_gpus,
+                              input_fn,
+                              expected_values,
+                              use_core_strategy=False):
     distribution, master_target, config = self._get_test_objects(
-        task_type, task_id, num_gpus)
+        task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
     devices = distribution.extended.worker_devices
 
     with ops.Graph().as_default(), \
@@ -531,9 +603,11 @@ class ParameterServerStrategyTestBase(
         self.assertEqual(expected_value, computed_value)
 
 
-class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
-                                  strategy_test_lib.DistributionTestBase,
-                                  parameterized.TestCase):
+class ParameterServerStrategyTest(
+    ParameterServerStrategyTestBase,
+    strategy_test_lib.DistributionTestBase,
+    strategy_test_lib.TwoDeviceDistributionTestBase,
+    parameterized.TestCase):
 
   @classmethod
   def setUpClass(cls):
@@ -541,66 +615,93 @@ class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
         num_workers=3, num_ps=2)
     cls._default_target = 'grpc://' + cls._cluster_spec[WORKER][0]
 
-  def test_num_replicas_in_sync(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=2)
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def test_num_replicas_in_sync(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
     # All the devices on a given worker are in sync which in this case is the
     # number of gpus on each worker.
-    self.assertEqual(2, distribution.num_replicas_in_sync)
+    self.assertEqual(2, strategy.num_replicas_in_sync)
 
-  def testDeviceAssignmentLocalCPU(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=0)
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testDeviceAssignmentLocalCPU(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=0, use_core_strategy=use_core_strategy)
     self._test_device_assignment_local(
-        distribution, compute_device='CPU', variable_device='CPU', num_gpus=0)
+        strategy, compute_device='CPU', variable_device='CPU', num_gpus=0)
 
-  def testDeviceAssignmentLocalOneGPU(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=1)
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testDeviceAssignmentLocalOneGPU(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=1, use_core_strategy=use_core_strategy)
     self._test_device_assignment_local(
-        distribution, compute_device='GPU', variable_device='GPU', num_gpus=1)
+        strategy, compute_device='GPU', variable_device='GPU', num_gpus=1)
 
-  def testDeviceAssignmentLocalTwoGPUs(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=2)
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testDeviceAssignmentLocalTwoGPUs(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
     self._test_device_assignment_local(
-        distribution, compute_device='GPU', variable_device='CPU', num_gpus=2)
+        strategy, compute_device='GPU', variable_device='CPU', num_gpus=2)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testDeviceAssignmentDistributed(self, num_gpus):
-    self._test_device_assignment_distributed('worker', 1, num_gpus)
+      combinations.combine(
+          mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False]))
+  def testDeviceAssignmentDistributed(self, num_gpus, use_core_strategy):
+    self._test_device_assignment_distributed(
+        'worker', 1, num_gpus, use_core_strategy=use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testDeviceAssignmentDistributedEnablePartitioner(self, num_gpus):
+      combinations.combine(
+          mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False]))
+  def testDeviceAssignmentDistributedEnablePartitioner(self, num_gpus,
+                                                       use_core_strategy):
     self._test_device_assignment_distributed_enable_partitioner(
-        'worker', 1, num_gpus)
+        'worker', 1, num_gpus, use_core_strategy=use_core_strategy)
 
-  def testSimpleBetweenGraph(self):
-    self._run_between_graph_clients(self._test_simple_increment,
-                                    self._cluster_spec, context.num_gpus())
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testSimpleBetweenGraph(self, use_core_strategy):
+    self._run_between_graph_clients(
+        self._test_simple_increment,
+        self._cluster_spec,
+        context.num_gpus(),
+        use_core_strategy=use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testLocalSimpleIncrement(self, num_gpus):
-    self._test_simple_increment(None, 0, num_gpus)
+      combinations.combine(
+          mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False]))
+  def testLocalSimpleIncrement(self, num_gpus, use_core_strategy):
+    self._test_simple_increment(None, 0, num_gpus, use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testMinimizeLossGraphDistributed(self, num_gpus):
-    self._run_between_graph_clients(self._test_minimize_loss_graph,
-                                    self._cluster_spec, num_gpus)
+      combinations.combine(
+          mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False]))
+  def testMinimizeLossGraphDistributed(self, num_gpus, use_core_strategy):
+    self._run_between_graph_clients(
+        self._test_minimize_loss_graph,
+        self._cluster_spec,
+        num_gpus,
+        use_core_strategy=use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testMinimizeLossGraphLocal(self, num_gpus):
-    self._test_minimize_loss_graph(None, None, num_gpus)
+      combinations.combine(
+          mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False]))
+  def testMinimizeLossGraphLocal(self, num_gpus, use_core_strategy):
+    self._test_minimize_loss_graph(None, None, num_gpus, use_core_strategy)
 
   # TODO(priyag): Refactor this and other multi worker tests.
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[1, 2], required_gpus=1))
-  def testMakeInputFnIteratorDistributed(self, num_gpus):
+      combinations.combine(
+          mode=['graph'],
+          num_gpus=[1, 2],
+          required_gpus=1,
+          use_core_strategy=[True, False]))
+  def testMakeInputFnIteratorDistributed(self, num_gpus, use_core_strategy):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
     dataset_fn = lambda: dataset_ops.Dataset.range(100)
@@ -612,12 +713,21 @@ class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
         expected_num_replicas_in_sync=num_gpus,
         expected_num_input_pipelines=3,
         expected_input_pipeline_id=1)  # because task_id = 1
-    self._test_input_fn_iterator('worker', 1, num_gpus,
-                                 input_fn, expected_values)
+    self._test_input_fn_iterator(
+        'worker',
+        1,
+        num_gpus,
+        input_fn,
+        expected_values,
+        use_core_strategy=use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[1, 2], required_gpus=1))
-  def testMakeInputFnIteratorLocal(self, num_gpus):
+      combinations.combine(
+          mode=['graph'],
+          num_gpus=[1, 2],
+          required_gpus=1,
+          use_core_strategy=[True, False]))
+  def testMakeInputFnIteratorLocal(self, num_gpus, use_core_strategy):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
     dataset_fn = lambda: dataset_ops.Dataset.range(100)
@@ -629,23 +739,31 @@ class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
         expected_num_replicas_in_sync=num_gpus,
         expected_num_input_pipelines=1,
         expected_input_pipeline_id=0)  # only one worker and pipeline for local.
-    self._test_input_fn_iterator(None, None, num_gpus,
-                                 input_fn, expected_values)
+    self._test_input_fn_iterator(
+        None,
+        None,
+        num_gpus,
+        input_fn,
+        expected_values,
+        use_core_strategy=use_core_strategy)
 
-  def testGlobalStepUpdate(self):
-    strategy = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=context.num_gpus())
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testGlobalStepUpdate(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(use_core_strategy=use_core_strategy)
     self._test_global_step_update(strategy)
 
-  def testUpdateConfigProtoMultiWorker(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=2)
-    distribution.configure(
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testUpdateConfigProtoMultiWorker(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
+    strategy.configure(
         cluster_spec=self._cluster_spec, task_type='worker', task_id=1)
 
     config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden'])
 
-    new_config = distribution.update_config_proto(config_proto)
+    new_config = strategy.update_config_proto(config_proto)
 
     # Verify device filters.
     self.assertEqual(['/job:worker/task:1', '/job:ps'],
@@ -654,16 +772,48 @@ class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
     # Verify isolate_session_state
     self.assertFalse(new_config.isolate_session_state)
 
-  def testUpdateConfigProtoLocal(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=2)
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testUpdateConfigProtoLocal(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
 
     config_proto = config_pb2.ConfigProto()
-    new_config = distribution.update_config_proto(config_proto)
+    new_config = strategy.update_config_proto(config_proto)
 
     # Verify isolate_session_state
     self.assertTrue(new_config.isolate_session_state)
 
+  def testAllReduceSum(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    self._test_all_reduce_sum(distribution)
+
+  def testAllReduceSumGradients(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    self._test_all_reduce_sum_gradients(distribution)
+
+  def testAllReduceSumGradientTape(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    self._test_all_reduce_sum_gradient_tape(distribution)
+
+  def testAllReduceMean(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    self._test_all_reduce_mean(distribution)
+
+  def testAllReduceMeanGradients(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    self._test_all_reduce_mean_gradients(distribution)
+
+  def testAllReduceMeanGradientTape(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    self._test_all_reduce_mean_gradient_tape(distribution)
+
 
 class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase,
                                            parameterized.TestCase):
@@ -674,20 +824,31 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase,
         num_workers=3, num_ps=2, has_chief=True)
     cls._default_target = 'grpc://' + cls._cluster_spec[CHIEF][0]
 
-  def testSimpleBetweenGraph(self):
-    self._run_between_graph_clients(self._test_simple_increment,
-                                    self._cluster_spec, context.num_gpus())
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testSimpleBetweenGraph(self, use_core_strategy):
+    self._run_between_graph_clients(
+        self._test_simple_increment,
+        self._cluster_spec,
+        context.num_gpus(),
+        use_core_strategy=use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testMinimizeLossGraph(self, num_gpus):
-    self._run_between_graph_clients(self._test_minimize_loss_graph,
-                                    self._cluster_spec, num_gpus)
+      combinations.combine(
+          mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False]))
+  def testMinimizeLossGraph(self, num_gpus, use_core_strategy):
+    self._run_between_graph_clients(
+        self._test_minimize_loss_graph,
+        self._cluster_spec,
+        num_gpus,
+        use_core_strategy=use_core_strategy)
 
-  def testGlobalStepIsWrappedOnTwoGPUs(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=2)
-    with ops.Graph().as_default(), distribution.scope():
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testGlobalStepIsWrappedOnTwoGPUs(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
+    with ops.Graph().as_default(), strategy.scope():
       created_step = training_util.create_global_step()
       get_step = training_util.get_global_step()
       self.assertEqual(created_step, get_step,
@@ -696,12 +857,14 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase,
                              id(get_step), get_step.__class__.__name__)))
       self.assertIs(values.AggregatingVariable, type(created_step))
       self.assertIs(values.AggregatingVariable, type(get_step))
-      self.assertIs(distribution, created_step.distribute_strategy)
+      self.assertIs(strategy, created_step.distribute_strategy)
 
-  def testGlobalStepIsNotWrappedOnOneGPU(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=1)
-    with ops.Graph().as_default(), distribution.scope():
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testGlobalStepIsNotWrappedOnOneGPU(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=1, use_core_strategy=use_core_strategy)
+    with ops.Graph().as_default(), strategy.scope():
       created_step = training_util.create_global_step()
       get_step = training_util.get_global_step()
       self.assertEqual(created_step, get_step,
@@ -710,20 +873,36 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase,
                              id(get_step), get_step.__class__.__name__)))
       self.assertIs(resource_variable_ops.ResourceVariable, type(created_step))
       self.assertIs(resource_variable_ops.ResourceVariable, type(get_step))
-      self.assertIs(distribution, created_step.distribute_strategy)
+      self.assertIs(strategy, created_step.distribute_strategy)
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testValueContainer(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
+    with ops.Graph().as_default(), strategy.scope():
 
-  def testValueContainer(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=2)
-    with ops.Graph().as_default(), distribution.scope():
       def f():
         with backprop.GradientTape() as tape:
           v = variable_scope.get_variable('v', initializer=10.0)
           _ = v * v
         v, = tape.watched_variables()
-        w = distribution.extended.value_container(v)
+        w = strategy.extended.value_container(v)
         self.assertIs(values.AggregatingVariable, type(w))
-      distribution.extended.call_for_each_replica(f)
+
+      strategy.extended.call_for_each_replica(f)
+
+
+class LocalParameterServerStrategyTest(strategy_test_lib.DistributionTestBase,
+                                       parameterized.TestCase):
+
+  @combinations.generate(combinations.combine(mode=['graph', 'eager'],
+                                              use_core_strategy=[True, False],
+                                              required_gpus=2))
+  def testNumpyIterator(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
+    self._test_numpy_iterator(strategy)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/distribute/python/step_fn_test.py b/tensorflow/contrib/distribute/python/step_fn_test.py
index 1ff9b9ceec13351b098d47ed3ff62f689a625a31..a77d6d0bec85d321666f9c4043d22fc6a0c37158 100644
--- a/tensorflow/contrib/distribute/python/step_fn_test.py
+++ b/tensorflow/contrib/distribute/python/step_fn_test.py
@@ -45,7 +45,6 @@ class SingleLossStepTest(test.TestCase, parameterized.TestCase):
       single_loss_step, layer = single_loss_example(
           optimizer_fn, distribution, use_bias=True, iterations_per_step=2)
 
-      self.evaluate(distribution.initialize())
       if context.executing_eagerly():
         run_step = single_loss_step
       else:
@@ -57,12 +56,9 @@ class SingleLossStepTest(test.TestCase, parameterized.TestCase):
       weights, biases = [], []
       for _ in range(5):
         run_step()
-
         weights.append(self.evaluate(layer.kernel))
         biases.append(self.evaluate(layer.bias))
 
-      self.evaluate(distribution.finalize())
-
       error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
       is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
       self.assertTrue(is_not_increasing)
diff --git a/tensorflow/contrib/distribute/python/strategy_test_lib.py b/tensorflow/contrib/distribute/python/strategy_test_lib.py
index 6e5280e35632d3f3cb6a4fe172a15fb7f508354c..7455cbd02a2571ed3bae81864580440c982e5f7b 100644
--- a/tensorflow/contrib/distribute/python/strategy_test_lib.py
+++ b/tensorflow/contrib/distribute/python/strategy_test_lib.py
@@ -18,7 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import distribution_strategy_context as ds_context
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.distribute import values
@@ -31,6 +34,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.layers import core
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
@@ -292,3 +296,190 @@ class DistributionTestBase(test.TestCase):
       global_step_tensors = strategy.unwrap(value)
       global_step_values = self.evaluate(global_step_tensors)
       self.assertEqual((1,) * len(global_step_tensors), global_step_values)
+
+  def _test_numpy_iterator(self, strategy):
+    with strategy.scope(), self.cached_session() as sess:
+      x = np.asarray([[1, 2], [6, 12], [2, 4],
+                      [5, 10], [3, 6], [4, 8]])
+      y = np.asarray([5, 4, 3, 2, 1, 0])
+      batch_size = 6
+      if not strategy.extended._global_batch_size:  # pylint: disable=protected-access
+        batch_size = batch_size // strategy.num_replicas_in_sync
+      i = strategy.experimental_make_numpy_iterator(
+          (x, y), batch_size=batch_size, num_epochs=2, shuffle=None,
+          session=sess)
+      self.evaluate(i.initialize())
+
+      def run_and_concatenate(strategy, i):
+        x, y = strategy.experimental_run(lambda z: z, i)
+        x, y = self.evaluate((strategy.unwrap(x), strategy.unwrap(y)))
+        return np.concatenate(x), np.concatenate(y)
+
+      x_1, y_1 = run_and_concatenate(strategy, i)
+      self.assertAllEqual(x, x_1)
+      self.assertAllEqual(y, y_1)
+      x_2, y_2 = run_and_concatenate(strategy, i)
+      self.assertAllEqual(x, x_2)
+      self.assertAllEqual(y, y_2)
+      with self.assertRaises(errors.OutOfRangeError):
+        run_and_concatenate(strategy, i)
+
+
+class OneDeviceDistributionTestBase(test.TestCase):
+  """Some tests that should work with any one-device DistributionStrategy."""
+
+  def _test_all_reduce_sum(self, strategy):
+    self._test_collective_comms(
+        strategy, _all_sum, inputs=(4., [42., 43.]), expected=(4., [42., 43.]))
+
+  def _test_all_reduce_sum_gradients(self, strategy):
+    self._test_collective_comms_gradients(
+        strategy, _all_sum, inputs=[4.], expected_grads=[4.])
+
+  def _test_all_reduce_sum_gradient_tape(self, strategy):
+    self._test_collective_comms_gradient_tape(
+        strategy, _all_sum, inputs=[4.], expected_grads=[4.])
+
+  def _test_all_reduce_mean(self, strategy):
+    self._test_collective_comms(
+        strategy, _all_mean, inputs=(2., [21., 22.]), expected=(2., [21., 22.]))
+
+  def _test_all_reduce_mean_gradients(self, strategy):
+    self._test_collective_comms_gradients(
+        strategy, _all_mean, inputs=[5.], expected_grads=[5.])
+
+  def _test_all_reduce_mean_gradient_tape(self, strategy):
+    self._test_collective_comms_gradient_tape(
+        strategy, _all_mean, inputs=[5.], expected_grads=[5.])
+
+  def _test_collective_comms(self, strategy, comm_fn, inputs, expected):
+    inputs = strategy.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensors(inputs))
+
+    self.evaluate(inputs.initialize())
+    outputs = self.evaluate(
+        list(map(strategy.unwrap, strategy.experimental_run(comm_fn, inputs))))
+    self.assertAllEqual([expected[0]], outputs[0])
+    self.assertAllEqual([expected[1]], outputs[1])
+
+  def _test_collective_comms_gradients(
+      self, strategy, comm_fn, inputs, expected_grads):
+    if context.executing_eagerly():
+      self.skipTest("`tf.gradients` is not supported with eager execution.")
+
+    def step(c):
+      x = constant_op.constant(42.)
+      y = comm_fn(x) * c
+      return gradients_impl.gradients(y, [x])[0]
+
+    inputs = strategy.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensors(inputs))
+
+    self.evaluate(inputs.initialize())
+    self.assertAllEqual(
+        expected_grads,
+        self.evaluate(strategy.unwrap(strategy.experimental_run(step, inputs))))
+
+  def _test_collective_comms_gradient_tape(
+      self, strategy, comm_fn, inputs, expected_grads):
+    def step(c):
+      x = constant_op.constant(42.)
+      with backprop.GradientTape() as tape:
+        tape.watch(x)
+        y = comm_fn(x) * c
+      return tape.gradient(y, x)
+
+    inputs = strategy.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensors(inputs))
+
+    self.evaluate(inputs.initialize())
+    self.assertAllEqual(
+        expected_grads,
+        self.evaluate(strategy.unwrap(strategy.experimental_run(step, inputs))))
+
+
+class TwoDeviceDistributionTestBase(test.TestCase):
+  """Some tests that should work with any two-device DistributionStrategy."""
+
+  def _test_all_reduce_sum(self, strategy):
+    self._test_collective_comms(
+        strategy, _all_sum,
+        inputs=([1., 3.], [[39., 2.], [3., 41.]]),
+        expected=(4., [42., 43.]))
+
+  def _test_all_reduce_sum_gradients(self, strategy):
+    self._test_collective_comms_gradients(
+        strategy, _all_sum, inputs=[1., 3.], expected_grads=[4., 4.])
+
+  def _test_all_reduce_sum_gradient_tape(self, strategy):
+    self._test_collective_comms_gradient_tape(
+        strategy, _all_sum, inputs=[1., 3.], expected_grads=[4., 4.])
+
+  def _test_all_reduce_mean(self, strategy):
+    self._test_collective_comms(
+        strategy, _all_mean,
+        inputs=([1., 3.], [[39., 2.], [3., 41.]]),
+        expected=(2., [21., 21.5]))
+
+  def _test_all_reduce_mean_gradients(self, strategy):
+    self._test_collective_comms_gradients(
+        strategy, _all_mean, inputs=[1., 3.], expected_grads=[2., 2.])
+
+  def _test_all_reduce_mean_gradient_tape(self, strategy):
+    self._test_collective_comms_gradient_tape(
+        strategy, _all_mean, inputs=[1., 3.], expected_grads=[2., 2.])
+
+  def _test_collective_comms(self, strategy, comm_fn, inputs, expected):
+    inputs = strategy.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensor_slices(inputs))
+
+    self.evaluate(inputs.initialize())
+    outputs = self.evaluate(
+        list(map(strategy.unwrap, strategy.experimental_run(comm_fn, inputs))))
+    self.assertAllEqual([expected[0], expected[0]], outputs[0])
+    self.assertAllEqual([expected[1], expected[1]], outputs[1])
+
+  def _test_collective_comms_gradients(
+      self, strategy, comm_fn, inputs, expected_grads):
+    if context.executing_eagerly():
+      self.skipTest("`tf.gradients` is not supported with eager execution.")
+
+    def step(c):
+      x = constant_op.constant(42.)
+      y = comm_fn(x) * c
+      return gradients_impl.gradients(y, [x])[0]
+
+    inputs = strategy.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensor_slices(inputs))
+
+    self.evaluate(inputs.initialize())
+    self.assertAllEqual(
+        expected_grads,
+        self.evaluate(strategy.unwrap(strategy.experimental_run(step, inputs))))
+
+  def _test_collective_comms_gradient_tape(
+      self, strategy, comm_fn, inputs, expected_grads):
+    def step(c):
+      x = constant_op.constant(42.)
+      with backprop.GradientTape() as tape:
+        tape.watch(x)
+        y = comm_fn(x) * c
+      return tape.gradient(y, x)
+
+    inputs = strategy.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensor_slices(inputs))
+
+    self.evaluate(inputs.initialize())
+    self.assertAllEqual(
+        expected_grads,
+        self.evaluate(strategy.unwrap(strategy.experimental_run(step, inputs))))
+
+
+def _all_sum(value):
+  ctx = ds_context.get_replica_context()
+  return ctx.all_reduce(reduce_util.ReduceOp.SUM, value)
+
+
+def _all_mean(value):
+  ctx = ds_context.get_replica_context()
+  return ctx.all_reduce(reduce_util.ReduceOp.MEAN, value)
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index 9e465f30c1b14dd1602919442aaffc5991ddeaf4..518c704b8990fbadab6a9707dd10611fd878e8ce 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -33,6 +33,8 @@ from tensorflow.python.client import session as session_lib
 from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import numpy_dataset
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.distribute import values
 from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver as resolver_lib
@@ -50,6 +52,26 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
 
+def initialize_tpu_system(cluster_resolver=None):
+  """Initialize the TPU devices in a separate session and graph.
+
+  Args:
+    cluster_resolver: A tf.contrib.cluster_resolver.TPUClusterResolver,
+        which provides information about the TPU cluster.
+  """
+  if cluster_resolver is None:
+    cluster_resolver = resolver_lib.TPUClusterResolver("")
+  master = cluster_resolver.master()
+
+  logging.info("Initializing the TPU system.")
+  session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+
+  with ops.Graph().as_default():
+    with session_lib.Session(config=session_config, target=master) as sess:
+      sess.run([tpu.initialize_system()])
+  logging.info("Finished initializing TPU system.")
+
+
 def get_tpu_system_metadata(tpu_cluster_resolver):
   """Retrieves TPU system metadata given a TPUClusterResolver."""
   master = tpu_cluster_resolver.master()
@@ -126,7 +148,8 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
 
   def __init__(self,
                tpu_cluster_resolver=None,
-               steps_per_run=None):
+               steps_per_run=None,
+               **kwargs):
     """Initializes the TPUStrategy object.
 
     Args:
@@ -137,10 +160,22 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
           metrics, summaries etc.
           This parameter is only used when Distribution Strategy is used with
           estimator or keras.
+      **kwargs: Additional experimental flags. Will be removed in future.
     """
     super(TPUStrategy, self).__init__(TPUExtended(
         self, tpu_cluster_resolver, steps_per_run))
 
+    self._disable_training_loop_on_host = False
+    if len(kwargs) > 1:
+      raise ValueError("TPUStrategy constructor only takes one experimental "
+                       "flag now")
+    if len(kwargs) == 1:
+      if "_disable_training_loop_on_host" not in kwargs:
+        raise ValueError("TPUStrategy constructor does not support arguments: "
+                         "{}".format(kwargs))
+      self._disable_training_loop_on_host = (
+          kwargs["_disable_training_loop_on_host"])
+
   @property
   def steps_per_run(self):
     """DEPRECATED: use .extended.steps_per_run instead."""
@@ -150,11 +185,6 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
 class TPUExtended(distribute_lib.DistributionStrategyExtended):
   """Implementation of TPUStrategy."""
 
-  # Track what TPU devices have been initialized. This is *intentionally*
-  # shared across all instances of TPUExtended as we want to keep track of which
-  # devices are initialized globally.
-  _initialized_devices = []
-
   def __init__(self,
                container_strategy,
                tpu_cluster_resolver=None,
@@ -191,39 +221,14 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
         (self.get_host(hid), [self.get_host_cpu_device(hid)])
         for hid in range(self.num_hosts)
     ]
-    self._input_workers = values.InputWorkers(input_device_map, worker_devices)
+    self._input_workers = input_lib.InputWorkers(
+        input_device_map, worker_devices)
 
     # TODO(sourabhbajaj): Remove this once performance of running one step
     # at a time is comparable to multiple steps.
     self.steps_per_run = steps_per_run
     self._require_static_shapes = True
 
-    # Initialize the TPU devices.
-    self._initialize_tpu()
-
-  def _initialize_tpu(self):
-    """Initialize the TPU devices in a separate session and graph.
-
-    We keep track of all the TPU devices that we're initialized as we should
-    only be running TPU initialize once for the entire process.
-    """
-    master = self._tpu_cluster_resolver.master()
-    # Verify TPU has not already been initialized in this process.
-    if master in TPUExtended._initialized_devices:
-      logging.info("TPU master %s has already been initialized." % master)
-      return
-
-    logging.info("Initializing the TPU system.")
-    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
-    self._configure(session_config)
-    with ops.Graph().as_default():
-      with session_lib.Session(config=session_config, target=master) as sess:
-        sess.run([tpu.initialize_system()])
-    logging.info("Finized initializing TPU system.")
-
-    # Update Strategy state to make sure we can track device initialization.
-    TPUExtended._initialized_devices.append(master)
-
   def _validate_colocate_with_variable(self, colocate_with_variable):
     values.validate_colocate_tpu_variable(colocate_with_variable, self)
 
@@ -290,15 +295,19 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
 
   def _make_dataset_iterator(self, dataset):
     """Make iterators for each of the TPU hosts."""
-
-    return values.DatasetIterator(dataset, self._input_workers,
-                                  self._num_replicas_in_sync)
+    return input_lib.DatasetIterator(dataset, self._input_workers,
+                                     self._num_replicas_in_sync)
 
   def _distribute_dataset(self, dataset_fn):
-    return values.MultiWorkerDataset(
+    return input_lib.MultiWorkerDataset(
         functools.partial(self._call_dataset_fn, dataset_fn),
         self._input_workers)
 
+  def _experimental_make_numpy_dataset(self, numpy_input, session):
+    return numpy_dataset.one_host_numpy_dataset(
+        numpy_input, numpy_dataset.SingleDevice(self.get_host_cpu_device(0)),
+        session)
+
   # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
   # TODO(sourabhbajaj): Remove the initial_loop_values parameter when we have
   # a mechanism to infer the outputs of `fn`. Pending b/110550782.
@@ -326,10 +335,11 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
     if initial_loop_values is None:
       initial_loop_values = {}
     initial_loop_values = nest.flatten(initial_loop_values)
-    ctx = values.MultiStepContext()
+    ctx = input_lib.MultiStepContext()
 
-    def run_fn():
+    def run_fn(*args, **kwargs):
       """Single step on the TPU device."""
+      del args, kwargs
       fn_result = fn(ctx, dequeue_fn())
       flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
       if flat_last_step_outputs:
@@ -338,6 +348,9 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
       else:
         return fn_result
 
+    def iterate_on_tpu():
+      return training_loop.repeat(iterations, run_fn, initial_loop_values)
+
     # We capture the control_flow_context at this point, before we run `fn`
     # inside a while_loop and TPU replicate context. This is useful in cases
     # where we might need to exit these contexts and get back to the outer
@@ -347,56 +360,77 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
     self._outer_control_flow_context = (
         ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
 
-    def rewrite_fn(*args):
-      """The rewritten step fn running on TPU."""
-      del args
+    # pylint: disable=protected-access
+    if self._container_strategy()._disable_training_loop_on_host:
       replicate_inputs = [[]] * self._num_replicas_in_sync
-      replicate_outputs = tpu.replicate(run_fn, replicate_inputs)
-
-      # If run_fn has tensor outputs, tpu.replicate returns a list of list. We
-      # will flatten it in this case. If run_fn has no tensor outputs,
-      # tpu.replicate returns a list of no_ops, we will keep the output as it
-      # is.
-      if isinstance(replicate_outputs[0], list):
-        replicate_outputs = nest.flatten(replicate_outputs)
-
-      return replicate_outputs
-
-    # TODO(sourabhbajaj): The input to while loop should be based on the output
-    # type of the step_fn
-    assert isinstance(initial_loop_values, list)
-    initial_loop_values = initial_loop_values * self._num_replicas_in_sync
-
-    # Put the while loop op on host 0.
-    with ops.device(self.get_host_cpu_device(0)):
-      replicate_outputs = training_loop.repeat(iterations, rewrite_fn,
-                                               initial_loop_values)
+      replicate_outputs = tpu.replicate(iterate_on_tpu, replicate_inputs)
+    else:
+      def rewrite_fn(*args):
+        """The rewritten step fn running on TPU."""
+        del args
+        replicate_inputs = [[]] * self._num_replicas_in_sync
+        replicate_outputs = tpu.replicate(run_fn, replicate_inputs)
+
+        # If run_fn has tensor outputs, tpu.replicate returns a list of list. We
+        # will flatten it in this case. If run_fn has no tensor outputs,
+        # tpu.replicate returns a list of no_ops, we will keep the output as it
+        # is.
+        if isinstance(replicate_outputs[0], list):
+          replicate_outputs = nest.flatten(replicate_outputs)
+
+        return replicate_outputs
+
+      # TODO(sourabhbajaj): The input to while loop should be based on the
+      # output type of the step_fn
+      assert isinstance(initial_loop_values, list)
+      initial_loop_values = initial_loop_values * self._num_replicas_in_sync
+
+      # Put the while loop op on host 0.
+      with ops.device(self.get_host_cpu_device(0)):
+        replicate_outputs = training_loop.repeat(iterations, rewrite_fn,
+                                                 initial_loop_values)
 
     del self._outer_control_flow_context
     ctx.run_op = control_flow_ops.group(replicate_outputs, enqueue_ops)
 
-    if isinstance(replicate_outputs, list):
+    if self._container_strategy()._disable_training_loop_on_host:
       # Filter out any ops from the outputs, typically this would be the case
       # when there were no tensor outputs.
-      last_step_tensor_outputs = [
-          x for x in replicate_outputs if not isinstance(x, ops.Operation)
-      ]
-
-      # Outputs are currently of the structure (flattened)
-      # [output0_device0, output1_device0, output2_device0,
-      #  output0_device1, output1_device1, output2_device1,
-      #  ...]
+      last_step_tensor_outputs = [x for x in replicate_outputs
+                                  if not isinstance(x, ops.Operation)]
+
+      # Outputs are currently of the structure (grouped by device)
+      # [[output0_device0, output1_device0, output2_device0],
+      #  [output0_device1, output1_device1, output2_device1]]
       # Convert this to the following structure instead: (grouped by output)
       # [[output0_device0, output0_device1],
       #  [output1_device0, output1_device1],
       #  [output2_device0, output2_device1]]
-      output_num = len(last_step_tensor_outputs) // self._num_replicas_in_sync
-      last_step_tensor_outputs = [
-          last_step_tensor_outputs[i::output_num] for i in range(output_num)
-      ]
+      last_step_tensor_outputs = [list(x) for x in
+                                  zip(*last_step_tensor_outputs)]
     else:
-      # no tensors returned.
-      last_step_tensor_outputs = []
+      if isinstance(replicate_outputs, list):
+        # Filter out any ops from the outputs, typically this would be the case
+        # when there were no tensor outputs.
+        last_step_tensor_outputs = [
+            x for x in replicate_outputs if not isinstance(x, ops.Operation)
+        ]
+
+        # Outputs are currently of the structure (flattened)
+        # [output0_device0, output1_device0, output2_device0,
+        #  output0_device1, output1_device1, output2_device1,
+        #  ...]
+        # Convert this to the following structure instead: (grouped by output)
+        # [[output0_device0, output0_device1],
+        #  [output1_device0, output1_device1],
+        #  [output2_device0, output2_device1]]
+        output_num = len(last_step_tensor_outputs) // self._num_replicas_in_sync
+        last_step_tensor_outputs = [
+            last_step_tensor_outputs[i::output_num] for i in range(output_num)
+        ]
+      else:
+        # no tensors returned.
+        last_step_tensor_outputs = []
 
     # Convert replicate_outputs to the original dict structure of
     # last_step_outputs.
@@ -423,19 +457,13 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
     with _TPUReplicaContext(self._container_strategy()):
       return fn(*args, **kwargs)
 
-  def _initialize(self):
-    if context.executing_eagerly():
-      # TODO(priyag): Add appopriate call here when eager is supported for TPUs.
-      raise NotImplementedError("Eager mode not supported in TPUStrategy.")
-    else:
-      return []
+  def _experimental_initialize_system(self):
+    """Experimental method added to be used by Estimator.
 
-  def _finalize(self):
-    if context.executing_eagerly():
-      # TODO(priyag): Add appopriate call here when eager is supported for TPUs.
-      raise NotImplementedError("Eager mode not supported in TPUStrategy.")
-    else:
-      return []
+    This is a private method only to be used by Estimator. Other frameworks
+    should directly be calling `tf.contrib.distribute.initialize_tpu_system`
+    """
+    initialize_tpu_system(self._tpu_cluster_resolver)
 
   def _create_variable(self, next_creator, *args, **kwargs):
     """Create a TPUMirroredVariable. See `DistributionStrategy.scope`."""
@@ -443,6 +471,9 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
     if colocate_with is None:
       device_map = self._device_map
       logical_device = 0  # TODO(josh11b): Get logical device from scope here.
+    elif isinstance(colocate_with, numpy_dataset.SingleDevice):
+      with ops.device(colocate_with.device):
+        return next_creator(*args, **kwargs)
     else:
       device_map = colocate_with.device_map
       logical_device = colocate_with.logical_device
@@ -635,6 +666,14 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
   def _global_batch_size(self):
+    """`make_dataset_iterator` and `make_numpy_iterator` use global batch size.
+
+    `distribute_dataset` and `make_input_fn_iterator` assume per-replica
+    batching.
+
+    Returns:
+      Boolean.
+    """
     return True
 
 
diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py
index 73efb524b93a367d98395d4e83ac4bf136318a27..51c58b0b2f3dc2ab63e22718825a471b8657f892 100644
--- a/tensorflow/contrib/distribute/python/values_test.py
+++ b/tensorflow/contrib/distribute/python/values_test.py
@@ -22,28 +22,20 @@ import os
 from absl.testing import parameterized
 
 from tensorflow.contrib.distribute.python import combinations
-from tensorflow.contrib.distribute.python import multi_worker_test_base
 from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.data.experimental.ops import batching
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import device_util
-from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.training import saver as saver_lib
-from tensorflow.python.util import nest
 
 
 class DistributedValuesTest(test.TestCase):
@@ -354,444 +346,6 @@ class RegroupAndSelectDeviceTest(test.TestCase):
                                                merged_estimator_spec))
 
 
-class PerReplicaDatasetTest(test.TestCase):
-
-  config = config_pb2.ConfigProto()
-  config.allow_soft_placement = True
-
-  def _test_iterator(self, devices, dataset, expected_values):
-    device_map = values.ReplicaDeviceMap(devices)
-    input_workers = values.InputWorkers(device_map)
-    per_replica_dataset = values.PerReplicaDataset(dataset, input_workers, 0)
-    if context.executing_eagerly():
-      iterator = per_replica_dataset.make_one_shot_iterator()
-    else:
-      iterator = per_replica_dataset.make_initializable_iterator()
-      self.evaluate([iterator.initializer])
-
-    for expected_value in expected_values:
-      next_element = iterator.get_next_as_list()
-      computed_value = self.evaluate(next_element)
-      self.assertEqual(expected_value, computed_value)
-
-    with self.assertRaises(errors.OutOfRangeError):
-      next_element = iterator.get_next_as_list()
-      self.evaluate(next_element)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testOneDevice(self):
-    devices = ["/device:CPU:0"]
-    dataset = dataset_ops.Dataset.range(10)
-
-    expected_values = [[i] for i in range(10)]
-
-    self._test_iterator(devices, dataset, expected_values)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testMultipleDevices(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dataset = dataset_ops.Dataset.range(10)
-
-    expected_values = [[i, i+1] for i in range(0, 10, 2)]
-
-    self._test_iterator(devices, dataset, expected_values)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testTupleDataset(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dataset1 = dataset_ops.Dataset.range(10)
-    dataset2 = dataset_ops.Dataset.range(10).map(lambda x: x**2)
-    dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
-
-    expected_values = [[(i, i**2), (i+1, (i+1)**2)] for i in range(0, 10, 2)]
-
-    self._test_iterator(devices, dataset, expected_values)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testUnevenDatasetBatches(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dataset = dataset_ops.Dataset.range(11)
-
-    expected_values = [[i, i+1] for i in range(0, 10, 2)]
-    self._test_iterator(devices, dataset, expected_values)
-
-  def testInitializableIterator(self):
-    with context.graph_mode():
-      devices = ["/device:CPU:0"]
-      # Using random input since that is only allowed with initializable
-      # iterator.
-      dataset = dataset_ops.Dataset.from_tensor_slices(
-          random_ops.random_uniform((10,)))
-
-      device_map = values.ReplicaDeviceMap(devices)
-      input_workers = values.InputWorkers(device_map)
-      per_replica_dataset = values.PerReplicaDataset(dataset, input_workers, 0)
-      iterator = per_replica_dataset.make_initializable_iterator()
-
-      self.evaluate(iterator.initializer)
-      next_element = iterator.get_next_as_list()
-      for _ in range(10):
-        self.evaluate(next_element)
-
-      # Should fail after the input is finished.
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
-
-      # After re-initializing the iterator, should be able to iterate again.
-      self.evaluate(iterator.initializer)
-      for _ in range(10):
-        self.evaluate(next_element)
-
-
-class MultiWorkerDatasetTest(multi_worker_test_base.MultiWorkerTestBase):
-
-  def _test_iterator(self, sess, iterator, devices, expected_values):
-    next_element = iterator.get_next()
-    for r, device in enumerate(devices):
-      v = values.select_replica(r, next_element)
-      # The `v` here can be a tuple.
-      for element in nest.flatten(v):
-        self.assertTrue(element.device in device)
-
-    for expected_value in expected_values:
-      t = [values.select_replica(r, next_element) for r in range(len(devices))]
-      actual = sess.run(t)
-      self.assertEqual(expected_value, actual)
-
-    with self.assertRaises(errors.OutOfRangeError):
-      sess.run([values.select_replica(r, next_element)
-                for r in range(len(devices))])
-
-  def _test_dataset(self, dataset_fn, worker_devices, devices,
-                    expected_values):
-    device_map = values.ReplicaDeviceMap(devices)
-    input_workers = values.InputWorkers(device_map, worker_devices)
-    multi_worker_dataset = values.MultiWorkerDataset(
-        dataset_fn, input_workers)
-    multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()
-    with self.cached_session() as sess:
-      sess.run(multi_worker_iterator.initializer)
-      self._test_iterator(sess, multi_worker_iterator, devices, expected_values)
-
-  def _cpu_devices(self):
-    worker_devices = (
-        ("/job:worker/replica:0/task:0",
-         ["/job:worker/replica:0/task:0/device:CPU:0"]),
-        ("/job:worker/replica:0/task:1",
-         ["/job:worker/replica:0/task:1/device:CPU:0"])
-    )
-    devices = [
-        "/job:worker/replica:0/task:0/device:CPU:0",
-        "/job:worker/replica:0/task:1/device:CPU:0"
-    ]
-    return worker_devices, devices
-
-  def _cpu_and_one_gpu_devices(self):
-    worker_devices = (
-        ("/job:worker/replica:0/task:0", (
-            "/job:worker/replica:0/task:0/device:GPU:0",
-            "/job:worker/replica:0/task:0/device:CPU:0"
-        )),
-        ("/job:worker/replica:0/task:1", (
-            "/job:worker/replica:0/task:1/device:GPU:0",
-            "/job:worker/replica:0/task:1/device:CPU:0"
-        ))
-    )
-    devices = [
-        "/job:worker/replica:0/task:0/device:GPU:0",
-        "/job:worker/replica:0/task:0/device:CPU:0",
-        "/job:worker/replica:0/task:1/device:GPU:0",
-        "/job:worker/replica:0/task:1/device:CPU:0"
-    ]
-    return worker_devices, devices
-
-  def testDataDistributionOneDevicePerWorker(self):
-    worker_devices, devices = self._cpu_devices()
-    with context.graph_mode():
-      dataset_fn = lambda: dataset_ops.Dataset.range(8)
-      self._test_dataset(
-          dataset_fn, worker_devices, devices,
-          [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]])
-
-  def testDataDistributionTwoDevicePerWorker(self):
-    if context.num_gpus() < 1:
-      self.skipTest("A GPU is not available for this test.")
-    worker_devices, devices = self._cpu_and_one_gpu_devices()
-    with context.graph_mode():
-      dataset_fn = lambda: dataset_ops.Dataset.range(8)
-      self._test_dataset(
-          dataset_fn, worker_devices, devices,
-          [[0, 1, 0, 1], [2, 3, 2, 3], [4, 5, 4, 5], [6, 7, 6, 7]])
-
-  def testTupleDataset(self):
-    worker_devices, devices = self._cpu_devices()
-
-    with context.graph_mode():
-
-      def dataset_fn():
-        dataset1 = dataset_ops.Dataset.range(8)
-        dataset2 = dataset_ops.Dataset.range(8).map(lambda x: x**2)
-        return dataset_ops.Dataset.zip((dataset1, dataset2))
-
-      expected_values = [[(i, i**2), (i, i**2)] for i in range(8)]
-      self._test_dataset(dataset_fn, worker_devices, devices,
-                         expected_values)
-
-  def testInitializableIterator(self):
-    worker_devices, devices = self._cpu_devices()
-    with context.graph_mode(), self.cached_session() as sess:
-      dataset_fn = lambda: dataset_ops.Dataset.range(8)
-      device_map = values.ReplicaDeviceMap(devices)
-      input_workers = values.InputWorkers(device_map, worker_devices)
-      multi_worker_dataset = values.MultiWorkerDataset(
-          dataset_fn, input_workers)
-      multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()
-
-      sess.run(multi_worker_iterator.initializer)
-      self._test_iterator(
-          sess, multi_worker_iterator, devices,
-          [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]])
-
-      # After re-initializing the iterator, should be able to iterate again.
-      sess.run(multi_worker_iterator.initializer)
-      self._test_iterator(
-          sess, multi_worker_iterator, devices,
-          [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]])
-
-  def testValueErrorForIterator(self):
-    # Incompatiable arguments.
-    d1 = "/device:GPU:0"
-    d2 = "/device:GPU:1"
-    device_map = values.ReplicaDeviceMap([d1, d2])
-    input_workers = values.InputWorkers(
-        device_map, (("w1", (d1,)), ("w2", (d2,))))
-    with self.assertRaises(ValueError):
-      values.MultiWorkerDataIterator([("w1", None)], input_workers)
-
-  def testDuplicateDevices(self):
-    _, devices = self._cpu_devices()
-    devices.append("/job:worker/replica:0/task:0/device:CPU:0")
-    with self.assertRaises(ValueError):
-      _ = values.ReplicaDeviceMap(devices)
-
-
-class InputIteratorTestBase(test.TestCase):
-
-  def _test_iterator(self, input_type, dataset_fn, worker_device_pairs,
-                     expected_values, sess=None, split_batch_by=None):
-    devices = nest.flatten([ds for _, ds in worker_device_pairs])
-    device_map = values.ReplicaDeviceMap(devices)
-    input_workers = values.InputWorkers(device_map, worker_device_pairs)
-
-    if input_type == "input_fn":
-      input_contexts = [
-          distribute_lib.InputContext() for _ in worker_device_pairs]
-      input_fn = lambda _: dataset_fn()
-      iterator = values.InputFunctionIterator(
-          input_fn, input_workers, input_contexts)
-    else:
-      iterator = values.DatasetIterator(
-          dataset_fn(), input_workers, split_batch_by)
-
-    evaluate = lambda x: sess.run(x) if sess else self.evaluate(x)
-
-    evaluate(control_flow_ops.group(iterator.initialize()))
-
-    for expected_value in expected_values:
-      next_element = iterator.get_next()
-      computed_value = evaluate(
-          [values.select_replica(r, next_element) for r in range(len(devices))])
-      self.assertAllEqual(expected_value, computed_value)
-
-    with self.assertRaises(errors.OutOfRangeError):
-      next_element = iterator.get_next()
-      evaluate([values.select_replica(r, next_element)
-                for r in range(len(devices))])
-
-    # After re-initializing the iterator, should be able to iterate again.
-    evaluate(control_flow_ops.group(iterator.initialize()))
-
-    for expected_value in expected_values:
-      next_element = iterator.get_next()
-      computed_value = evaluate(
-          [values.select_replica(r, next_element) for r in range(len(devices))])
-      self.assertAllEqual(expected_value, computed_value)
-
-
-class InputIteratorSingleWorkerTest(InputIteratorTestBase,
-                                    parameterized.TestCase):
-
-  @combinations.generate(combinations.combine(
-      mode=["graph", "eager"],
-      input_type=["input_fn", "dataset"]))
-  def testOneDeviceCPU(self, input_type):
-    worker_device_pairs = [("", ["/device:CPU:0"])]
-    dataset_fn = lambda: dataset_ops.Dataset.range(10)
-
-    expected_values = [[i] for i in range(10)]
-
-    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
-                        expected_values)
-
-  @combinations.generate(combinations.combine(
-      mode=["graph", "eager"],
-      input_type=["input_fn", "dataset"],
-      required_gpus=1))
-  def testTwoDevicesOneGPUOneCPU(self, input_type):
-    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
-    dataset_fn = lambda: dataset_ops.Dataset.range(10)
-
-    expected_values = [[i, i+1] for i in range(0, 10, 2)]
-
-    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
-                        expected_values)
-
-  @combinations.generate(combinations.combine(
-      mode=["graph", "eager"],
-      input_type=["input_fn", "dataset"],
-      required_gpus=1))
-  def testTupleDataset(self, input_type):
-    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
-    def dataset_fn():
-      dataset1 = dataset_ops.Dataset.range(10)
-      dataset2 = dataset_ops.Dataset.range(10).map(lambda x: x**2)
-      return dataset_ops.Dataset.zip((dataset1, dataset2))
-
-    expected_values = [[(i, i**2), (i+1, (i+1)**2)] for i in range(0, 10, 2)]
-
-    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
-                        expected_values)
-
-  @combinations.generate(combinations.combine(
-      mode=["graph", "eager"],
-      input_type=["input_fn", "dataset"],
-      required_gpus=1))
-  def testUnevenDatasetBatches(self, input_type):
-    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
-    dataset_fn = lambda: dataset_ops.Dataset.range(11)
-
-    expected_values = [[i, i+1] for i in range(0, 10, 2)]
-    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
-                        expected_values)
-
-  @combinations.generate(combinations.combine(
-      mode=["graph", "eager"],
-      input_type=["dataset"],
-      split_batch_by=[None, 2],
-      required_gpus=1))
-  def testBatchSplitting(self, input_type, split_batch_by):
-    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
-    batch_size = 10
-    dataset_fn = lambda: dataset_ops.Dataset.range(100).batch(batch_size)
-
-    updated_batch_size = (
-        batch_size // split_batch_by if split_batch_by else batch_size)
-    expected_values = [[range(i, i+updated_batch_size),
-                        range(i+updated_batch_size, i+2*updated_batch_size)]
-                       for i in range(0, 100, updated_batch_size*2)]
-
-    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
-                        expected_values, sess=None,
-                        split_batch_by=split_batch_by)
-
-
-class InputIteratorMultiWorkerTest(
-    multi_worker_test_base.MultiWorkerTestBase, InputIteratorTestBase,
-    parameterized.TestCase):
-
-  def _cpu_devices(self):
-    return [
-        ("/job:worker/replica:0/task:0",
-         ["/job:worker/replica:0/task:0/device:CPU:0"]),
-        ("/job:worker/replica:0/task:1",
-         ["/job:worker/replica:0/task:1/device:CPU:0"])]
-
-  def _cpu_and_one_gpu_devices(self):
-    return [
-        ("/job:worker/replica:0/task:0", [
-            "/job:worker/replica:0/task:0/device:GPU:0",
-            "/job:worker/replica:0/task:0/device:CPU:0"
-        ]),
-        ("/job:worker/replica:0/task:1", [
-            "/job:worker/replica:0/task:1/device:GPU:0",
-            "/job:worker/replica:0/task:1/device:CPU:0"
-        ])
-    ]
-
-  @combinations.generate(combinations.combine(
-      mode=["graph"],
-      input_type=["input_fn", "dataset"]))
-  def testOneDevicePerWorker(self, input_type):
-    worker_devices = self._cpu_devices()
-    with context.graph_mode(), self.cached_session() as sess:
-      dataset_fn = lambda: dataset_ops.Dataset.range(4)
-      self._test_iterator(input_type, dataset_fn, worker_devices,
-                          [[0, 0], [1, 1], [2, 2], [3, 3]], sess)
-
-  @combinations.generate(combinations.combine(
-      mode=["graph"],
-      input_type=["input_fn", "dataset"],
-      required_gpus=1))
-  def testTwoDevicesPerWorker(self, input_type):
-    worker_devices = self._cpu_and_one_gpu_devices()
-    with context.graph_mode(), self.cached_session() as sess:
-      dataset_fn = lambda: dataset_ops.Dataset.range(4)
-      self._test_iterator(input_type, dataset_fn, worker_devices,
-                          [[0, 1, 0, 1], [2, 3, 2, 3]], sess)
-
-  @combinations.generate(combinations.combine(
-      mode=["graph"],
-      input_type=["input_fn", "dataset"]))
-  def testTupleDataset(self, input_type):
-    worker_devices = self._cpu_devices()
-    with context.graph_mode(), self.cached_session() as sess:
-      def dataset_fn():
-        dataset1 = dataset_ops.Dataset.range(4)
-        dataset2 = dataset_ops.Dataset.range(4).map(lambda x: x**2)
-        return dataset_ops.Dataset.zip((dataset1, dataset2))
-
-      expected_values = [[(i, i**2), (i, i**2)] for i in range(0, 4)]
-      self._test_iterator(input_type, dataset_fn, worker_devices,
-                          expected_values, sess)
-
-
-class SplitDatasetBatchTest(test.TestCase):
-
-  def testBatchDataset(self):
-    dataset = dataset_ops.Dataset.range(100).batch(20)
-    split_batch_by = 2
-    result_dataset = values._split_dataset_batch(dataset, split_batch_by)
-    expected_values = [range(i, i+10) for i in range(0, 100, 10)]
-    result = [self.evaluate(el) for el in result_dataset]
-    self.assertAllEqual(expected_values, result)
-
-  def testMapAndBatchDataset(self):
-    dataset = dataset_ops.Dataset.range(100)
-    dataset = dataset.apply(batching.map_and_batch(lambda x: x, 20))
-    split_batch_by = 2
-    result_dataset = values._split_dataset_batch(dataset, split_batch_by)
-    expected_values = [range(i, i+10) for i in range(0, 100, 10)]
-    result = [self.evaluate(el) for el in result_dataset]
-    self.assertAllEqual(expected_values, result)
-
-  def testPrefetchDataset(self):
-    dataset = dataset_ops.Dataset.range(100).batch(20).prefetch(1)
-    split_batch_by = 2
-    result_dataset = values._split_dataset_batch(dataset, split_batch_by)
-    expected_values = [range(i, i+10) for i in range(0, 100, 10)]
-    result = [self.evaluate(el) for el in result_dataset]
-    self.assertAllEqual(expected_values, result)
-
-
 class MirroredVariableTest(test.TestCase, parameterized.TestCase):
 
   config = config_pb2.ConfigProto()
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
index 15776c694e92825895437a4c1547699f6d9269fb..9b5a2c947b153308c83f1a922d06c034ec5f9ddf 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
@@ -128,7 +128,7 @@ class PTBModel(tf.keras.Model):
 
     self.linear = layers.Dense(
         vocab_size, kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
-    self._output_shape = [-1, embedding_dim]
+    self._output_shape = [-1, hidden_dim]
 
   def call(self, input_seq, training):
     """Run the forward pass of PTBModel.
diff --git a/tensorflow/contrib/feature_column/BUILD b/tensorflow/contrib/feature_column/BUILD
index 4c1d1a29f20b5574b63cf87ecf62db95f92902cd..4e29e2559986012d8eeeaec807f14181226363aa 100644
--- a/tensorflow/contrib/feature_column/BUILD
+++ b/tensorflow/contrib/feature_column/BUILD
@@ -6,7 +6,7 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
 py_library(
     name = "feature_column_py",
@@ -37,13 +37,13 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "sequence_feature_column_test",
     srcs = ["python/feature_column/sequence_feature_column_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
+    additional_deps = [
         ":sequence_feature_column",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
@@ -53,17 +53,14 @@ py_test(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
         "//tensorflow/python/feature_column:feature_column_py",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
+    tags = ["no_pip"],
 )
 
-py_test(
+tf_py_test(
     name = "sequence_feature_column_integration_test",
     srcs = ["python/feature_column/sequence_feature_column_integration_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
+    additional_deps = [
         ":sequence_feature_column",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
@@ -73,6 +70,7 @@ py_test(
         "//tensorflow/python/feature_column:feature_column_py",
         "//tensorflow/python/keras:layers",
     ],
+    tags = ["no_pip"],
 )
 
 py_library(
@@ -94,14 +92,14 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "sequence_feature_column_v2_test",
     srcs = ["python/feature_column/sequence_feature_column_v2_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
+    additional_deps = [
         ":sequence_feature_column",
         ":sequence_feature_column_v2",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
@@ -112,7 +110,6 @@ py_test(
         "//tensorflow/python:training",
         "//tensorflow/python/feature_column:feature_column_py",
         "//tensorflow/python/feature_column:feature_column_v2_test",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
+    tags = ["no_pip"],
 )
diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
index dad50a3a73085526f65bd87c3d8549ceb75b3af4..88a14a2a94cc683f021d032ea11358e0cfb63faa 100644
--- a/tensorflow/contrib/framework/BUILD
+++ b/tensorflow/contrib/framework/BUILD
@@ -50,6 +50,7 @@ tf_custom_op_py_library(
     visibility = [
         "//learning/brain:__subpackages__",
         "//tensorflow:__subpackages__",
+        "//tensorflow_estimator:__subpackages__",
         "//video/youtube/personalization:__subpackages__",
     ],
     deps = [
diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD
index ae8320cfb279ac3a5cf7cb17ea2d82da79dbe432..db0868fb2c43464a811b3d6dfcd96480ba2463ee 100644
--- a/tensorflow/contrib/gan/BUILD
+++ b/tensorflow/contrib/gan/BUILD
@@ -1,8 +1,7 @@
-# Files for using TFGAN framework.
+# Files for using TF-GAN framework.
 load("//tensorflow:tensorflow.bzl", "py_test")
 
 package(default_visibility = [
-    "//learning/brain/contrib/tfgan:__subpackages__",
     "//tensorflow:__subpackages__",
 ])
 
@@ -107,6 +106,7 @@ py_library(
     deps = [
         ":gan_estimator",
         ":head",
+        ":latent_gan_estimator",
         ":stargan_estimator",
         ":tpu_gan_estimator",
         "//tensorflow/python:util",
@@ -132,6 +132,7 @@ py_library(
         ":clip_weights",
         ":conditioning_utils",
         ":random_tensor_pool",
+        ":spectral_normalization",
         ":virtual_batchnorm",
         "//tensorflow/python:util",
     ],
@@ -145,16 +146,15 @@ py_library(
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:clip_ops",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:gradients",
+        "//tensorflow/python:gradients_impl",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:summary",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/ops/distributions",
         "//tensorflow/python/ops/losses",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -570,22 +570,18 @@ py_test(
     deps = [
         ":namedtuples",
         ":stargan_estimator",
-        ":tuple_losses",
         "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/contrib/learn",
-        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
-        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
         "//tensorflow/python:training_util",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/estimator:numpy_io",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
         "@six_archive//:six",
@@ -647,6 +643,41 @@ py_test(
     ],
 )
 
+py_library(
+    name = "latent_gan_estimator",
+    srcs = [
+        "python/estimator/python/latent_gan_estimator.py",
+        "python/estimator/python/latent_gan_estimator_impl.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":train",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training_util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/estimator:estimator_py",
+    ],
+)
+
+py_test(
+    name = "latent_gan_estimator_test",
+    srcs = [
+        "python/estimator/python/latent_gan_estimator_test.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":latent_gan_estimator",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/estimator:run_config",
+        "//tensorflow/python/ops/losses",
+    ],
+)
+
 py_library(
     name = "sliced_wasserstein",
     srcs = [
@@ -681,3 +712,45 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
+
+py_library(
+    name = "spectral_normalization",
+    srcs = [
+        "python/features/python/spectral_normalization.py",
+        "python/features/python/spectral_normalization_impl.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:standard_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/keras:engine",
+    ],
+)
+
+py_test(
+    name = "spectral_normalization_test",
+    srcs = ["python/features/python/spectral_normalization_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":spectral_normalization",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/slim",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/keras:layers",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/contrib/gan/README.md b/tensorflow/contrib/gan/README.md
index 9ab86329eaf0e6fd426aef1f552f4e27c2ad65de..db7dc51daa78ecee12ecb7f6d33df4511e068243 100644
--- a/tensorflow/contrib/gan/README.md
+++ b/tensorflow/contrib/gan/README.md
@@ -1,14 +1,15 @@
 <!-- TODO(joelshor): Add images to the examples. -->
-# TensorFlow-GAN (TFGAN)
+<!-- TODO(joelshor): Add link to new location when b/122114187 is done. -->
+# TensorFlow-GAN (TF-GAN)
 
-TFGAN is a lightweight library for training and evaluating Generative
+TF-GAN is a lightweight library for training and evaluating Generative
 Adversarial Networks (GANs). This technique allows you to train a network
 (called the 'generator') to sample from a distribution, without having to
 explicitly model the distribution and without writing an explicit loss. For
 example, the generator could learn to draw samples from the distribution of
 natural images. For more details on this technique, see
 ['Generative Adversarial Networks'](https://arxiv.org/abs/1406.2661) by
-Goodfellow et al. See [tensorflow/models](https://github.com/tensorflow/models/tree/master/research/gan/) for examples, and [this tutorial](https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb) for an
+Goodfellow et al. See [tensorflow/models](https://github.com/tensorflow/models/tree/master/research/gan/) for examples, and [this tutorial](http://https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb) for an
 introduction.
 
 #### Usage
@@ -17,27 +18,27 @@ import tensorflow as tf
 tfgan = tf.contrib.gan
 ```
 
-## Why TFGAN?
+## Why TF-GAN?
 
 * Easily train generator and discriminator networks with well-tested, flexible [library calls](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/train.py). You can
-mix TFGAN, native TF, and other custom frameworks
+mix TF-GAN, native TF, and other custom frameworks
 * Use already implemented [GAN losses and penalties](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/losses/python/losses_impl.py) (ex Wasserstein loss, gradient penalty, mutual information penalty, etc)
 * [Monitor and visualize](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/eval/python/summaries_impl.py) GAN progress during training, and [evaluate](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py) them
 * Use already-implemented [tricks](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/features/python/) to stabilize and improve training
 * Develop based on examples of [common GAN setups](https://github.com/tensorflow/models/tree/master/research/gan/)
-* Use the TFGAN-backed [GANEstimator](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py) to easily train a GAN model
-* Improvements in TFGAN infrastructure will automatically benefit your TFGAN project
+* Use the TF-GAN-backed [GANEstimator](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py) to easily train a GAN model
+* Improvements in TF-GAN infrastructure will automatically benefit your TF-GAN project
 * Stay up-to-date with research as we add more algorithms
 
-## What are the TFGAN components?
+## What are the TF-GAN components?
 
-TFGAN is composed of several parts which were design to exist independently.
+TF-GAN is composed of several parts which were design to exist independently.
 These include the following main pieces (explained in detail below).
 
 *   [core](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/train.py):
     provides the main infrastructure needed to train a GAN. Training occurs in
     four phases, and each phase can be completed by custom-code or by using a
-    TFGAN library call.
+    TF-GAN library call.
 
 *   [features](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/features/python/):
     Many common GAN operations and normalization techniques are implemented for
@@ -56,14 +57,15 @@ These include the following main pieces (explained in detail below).
     generative models.
 
 *   [examples](https://github.com/tensorflow/models/tree/master/research/gan/)
-    and [tutorial](https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb): See examples of how to use TFGAN to make
-    GAN training easier, or use the more complicated examples to jumpstart your
-    own project. These include unconditional and conditional GANs, InfoGANs,
-    adversarial losses on existing networks, and image-to-image translation.
+    and [tutorial](http://https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb): See examples of how to use TF-GAN
+    to make GAN training easier, or use the more complicated examples to
+    jumpstart your own project. These include unconditional and conditional
+    GANs, InfoGANs, adversarial losses on existing networks, and image-to-image
+    translation.
 
 ## Training a GAN model
 
-Training in TFGAN typically consists of the following steps:
+Training in TF-GAN typically consists of the following steps:
 
 1. Specify the input to your networks.
 1. Set up your generator and discriminator using a `GANModel`.
@@ -71,12 +73,12 @@ Training in TFGAN typically consists of the following steps:
 1. Create your train ops using a `GANTrainOps`.
 1. Run your train ops.
 
-At each stage, you can either use TFGAN's convenience functions, or you can
+At each stage, you can either use TF-GAN's convenience functions, or you can
 perform the step manually for fine-grained control. We provide examples below.
 
 There are various types of GAN setups. For instance, you can train a generator
 to sample unconditionally from a learned distribution, or you can condition on
-extra information such as a class label. TFGAN is compatible with many setups,
+extra information such as a class label. TF-GAN is compatible with many setups,
 and we demonstrate a few below:
 
 ### Examples
@@ -254,9 +256,9 @@ with variable_scope.variable_scope(dis_scope, reuse=True):
   discriminator_real_outputs = discriminator_fn(images)
 generator_variables = variables_lib.get_trainable_variables(gen_scope)
 discriminator_variables = variables_lib.get_trainable_variables(dis_scope)
-# Depending on what TFGAN features you use, you don't always need to supply
+# Depending on what TF-GAN features you use, you don't always need to supply
 # every `GANModel` field. At a minimum, you need to include the discriminator
-# outputs and variables if you want to use TFGAN to construct losses.
+# outputs and variables if you want to use TF-GAN to construct losses.
 gan_model = tfgan.GANModel(
     generator_inputs,
     generated_data,
diff --git a/tensorflow/contrib/gan/__init__.py b/tensorflow/contrib/gan/__init__.py
index f1946c7f925660eae3aaa650c437e03da1f33d6c..1e6000898f7b8a53ad3f6fa12deebd54bf3a57ff 100644
--- a/tensorflow/contrib/gan/__init__.py
+++ b/tensorflow/contrib/gan/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN is a lightweight library for training and evaluating GANs.
+"""TF-GAN is a lightweight library for training and evaluating GANs.
 
 In addition to providing the infrastructure for easily training and evaluating
 GANS, this library contains modules for a TFGAN-backed Estimator,
@@ -24,7 +24,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# Collapse TFGAN into a tiered namespace.
+# Collapse TF-GAN into a tiered namespace.
 from tensorflow.contrib.gan.python import estimator
 from tensorflow.contrib.gan.python import eval  # pylint:disable=redefined-builtin
 from tensorflow.contrib.gan.python import features
diff --git a/tensorflow/contrib/gan/python/estimator/__init__.py b/tensorflow/contrib/gan/python/estimator/__init__.py
index 75cccb5ea004c11f250050fedf7475dec6d3b699..430266555b723e6ca39dccffc1442dbef5d4a385 100644
--- a/tensorflow/contrib/gan/python/estimator/__init__.py
+++ b/tensorflow/contrib/gan/python/estimator/__init__.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN estimator module.
+"""TF-GAN estimator module.
 
 GANEstimator provides all the infrastructure support of a TensorFlow Estimator
-with the feature support of TFGAN.
+with the feature support of TF-GAN.
 """
 
 from __future__ import absolute_import
@@ -26,11 +26,13 @@ from __future__ import print_function
 # pylint: disable=unused-import,wildcard-import
 from tensorflow.contrib.gan.python.estimator.python import gan_estimator
 from tensorflow.contrib.gan.python.estimator.python import head
+from tensorflow.contrib.gan.python.estimator.python import latent_gan_estimator
 from tensorflow.contrib.gan.python.estimator.python import stargan_estimator
 from tensorflow.contrib.gan.python.estimator.python import tpu_gan_estimator
 
 from tensorflow.contrib.gan.python.estimator.python.gan_estimator import *
 from tensorflow.contrib.gan.python.estimator.python.head import *
+from tensorflow.contrib.gan.python.estimator.python.latent_gan_estimator import *
 from tensorflow.contrib.gan.python.estimator.python.stargan_estimator import *
 from tensorflow.contrib.gan.python.estimator.python.tpu_gan_estimator import *
 # pylint: enable=unused-import,wildcard-import
@@ -41,7 +43,8 @@ _allowed_symbols = ([
     'gan_estimator',
     'stargan_estimator',
     'tpu_gan_estimator',
+    'latent_gan_estimator',
     'head',
 ] + gan_estimator.__all__ + stargan_estimator.__all__ + head.__all__ +
-                    tpu_gan_estimator.__all__)
+                    tpu_gan_estimator.__all__ + latent_gan_estimator.__all__)
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
index adb72228217892fffc10b0e2630edcd9d3e38a02..dd904611d1a3bb78de8316d5ed29ab0f800f29a9 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""A TFGAN-backed GAN Estimator."""
+"""A TF-GAN-backed GAN Estimator."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -56,10 +56,10 @@ _summary_type_map = {
 class GANEstimator(estimator.Estimator):
   """An estimator for Generative Adversarial Networks (GANs).
 
-  This Estimator is backed by TFGAN. The network functions follow the TFGAN API
-  except for one exception: if either `generator_fn` or `discriminator_fn` have
-  an argument called `mode`, then the tf.Estimator mode is passed in for that
-  argument. This helps with operations like batch normalization, which have
+  This Estimator is backed by TF-GAN. The network functions follow the TF-GAN
+  API except for one exception: if either `generator_fn` or `discriminator_fn`
+  have an argument called `mode`, then the tf.Estimator mode is passed in for
+  that argument. This helps with operations like batch normalization, which have
   different train and evaluation behavior.
 
   Example:
@@ -68,7 +68,7 @@ class GANEstimator(estimator.Estimator):
       import tensorflow as tf
       tfgan = tf.contrib.gan
 
-      # See TFGAN's `train.py` for a description of the generator and
+      # See TF-GAN's `train.py` for a description of the generator and
       # discriminator API.
       def generator_fn(generator_inputs):
         ...
@@ -123,13 +123,13 @@ class GANEstimator(estimator.Estimator):
         to continue training a previously saved model.
       generator_fn: A python function that takes a Tensor, Tensor list, or
         Tensor dictionary as inputs and returns the outputs of the GAN
-        generator. See `TFGAN` for more details and examples. Additionally, if
+        generator. See `TF-GAN` for more details and examples. Additionally, if
         it has an argument called `mode`, the Estimator's `mode` will be passed
         in (ex TRAIN, EVAL, PREDICT). This is useful for things like batch
         normalization.
       discriminator_fn: A python function that takes the output of
         `generator_fn` or real data in the GAN setup, and `generator_inputs`.
-        Outputs a Tensor in the range [-inf, inf]. See `TFGAN` for more details
+        Outputs a Tensor in the range [-inf, inf]. See `TF-GAN` for more details
         and examples.
       generator_loss_fn: The loss function on the generator. Takes a `GANModel`
         tuple.
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
index 5a3d29cf0b3cb1bbe03cb5ba4f327caf46432b76..5b9c54e43a16adf457d5ed0e7e73dcd168ab0d67 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for TFGAN's estimator.py."""
+"""Tests for TF-GAN's estimator.py."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/estimator/python/head_impl.py b/tensorflow/contrib/gan/python/estimator/python/head_impl.py
index 1a0ee6dfc498eb6dc8c97411589d9e35bc352062..cbe990b476c3b17ce61e0826b17d10976fea43c7 100644
--- a/tensorflow/contrib/gan/python/estimator/python/head_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/head_impl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""A TFGAN-backed GAN Estimator."""
+"""A TF-GAN-backed GAN Estimator."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/estimator/python/head_test.py b/tensorflow/contrib/gan/python/estimator/python/head_test.py
index 8205bc889dc01c8680e2139393d65723280cfbd0..5b50234a0e33cd297b176f142b358338966b6758 100644
--- a/tensorflow/contrib/gan/python/estimator/python/head_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/head_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for TFGAN's head.py."""
+"""Tests for TF-GAN's head.py."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator.py b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e164e24168bb0cc5e9a7cc772081781ea088bb1
--- /dev/null
+++ b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator.py
@@ -0,0 +1,28 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""`tf.Learn` components for `Train Input Estimator`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.gan.python.estimator.python import latent_gan_estimator_impl
+# pylint: disable=wildcard-import
+from tensorflow.contrib.gan.python.estimator.python.latent_gan_estimator_impl import *
+# pylint: enable=wildcard-import
+from tensorflow.python.util.all_util import remove_undocumented
+
+__all__ = latent_gan_estimator_impl.__all__
+remove_undocumented(__name__, __all__)
diff --git a/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5afc7731937ed1a82c8ebb5969b2687ffdd583b
--- /dev/null
+++ b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_impl.py
@@ -0,0 +1,205 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implements an estimator wrapper that allows training the input latent space.
+
+This file implements a latent gan estimator that wraps around a previously
+trained GAN. The latent gan estimator trains a single variable z, representing
+the hidden latent distribution that is the 'noise' input to the GAN. By training
+z, the inpainting estimator can move around the latent z space towards
+minimizing a specific loss function.
+
+The latent gan estimator has a few key differences from a normal estimator.
+
+First: the variables in the estimator should not be saved, as we are not
+updating the original GAN and are only adding a new z variable that is meant
+to be different for each run. In order to do distributed training using
+train_and_evaluate, the Tensorflow RunConfig is expected to save checkpoints
+by having either save_checkpoints_steps or save_checkpoints_secs saved.
+To avoid this conflict, we purposely set the save_checkpoints_steps value in
+the RunConfig to be one step more than the total number of steps that the
+inpainter estimator will run.
+
+Second: we need to specify warm start settings, as we are reloading the
+GAN model into a different graph (specifically, one with a new z variable).
+The warm start settings defined below reload all GAN variables and ignore the
+new z variable (and the optimizer).
+
+Usage:
+
+  def _generator(net, mode):
+    ...
+
+  def _discriminator(net, condition, mode):
+    ...
+
+  def _loss(gan_model, features, labels, add_summaries):
+    ...
+
+  def optimizer():
+    ...
+
+  params = {<required params>}
+  config = tf.estimator.RunConfig()
+  tmp_dir = path/to/output/storage
+
+  estimator = latent_gan_estimator.get_latent_gan_estimator(
+      _generator, _discriminator, _loss, optimizer, params, config, tmp_dir)
+
+  def input_fn():
+    ...
+
+  estimator.train(input_fn=input_fn)
+
+See latent_gan_estimator_test.py or tensorflow_models/gan/face_inpainting for
+further examples.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+from tensorflow.contrib.gan.python import train as tfgan_train
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.summary import summary
+from tensorflow.python.training import training_util
+
+
+INPUT_NAME = 'new_var_z_input'  # The name for the new z space input variable.
+OPTIMIZER_NAME = 'latent_gan_optimizer'  # The name for the new optimizer vars.
+
+__all__ = [
+    'get_latent_gan_estimator',
+]
+
+
+def _get_latent_gan_model_fn(generator_fn, discriminator_fn, loss_fn,
+                             optimizer):
+  """Sets up a model function that wraps around a given GAN."""
+  def model_fn(features, labels, mode, params):
+    """Model function defining an inpainting estimator."""
+    batch_size = params['batch_size']
+    z_shape = [batch_size] + params['z_shape']
+    add_summaries = params['add_summaries']
+    input_clip = params['input_clip']
+
+    z = variable_scope.get_variable(
+        name=INPUT_NAME, initializer=random_ops.truncated_normal(z_shape),
+        constraint=lambda x: clip_ops.clip_by_value(x, -input_clip, input_clip))
+
+    generator = functools.partial(generator_fn, mode=mode)
+    discriminator = functools.partial(discriminator_fn, mode=mode)
+    gan_model = tfgan_train.gan_model(generator_fn=generator,
+                                      discriminator_fn=discriminator,
+                                      real_data=labels,
+                                      generator_inputs=z,
+                                      check_shapes=False)
+
+    loss = loss_fn(gan_model, features, labels, add_summaries)
+
+    # Use a variable scope to make sure that estimator variables dont cause
+    # save/load problems when restoring from ckpts.
+    with variable_scope.variable_scope(OPTIMIZER_NAME):
+      opt = optimizer(learning_rate=params['learning_rate'],
+                      **params['opt_kwargs'])
+      train_op = opt.minimize(
+          loss=loss, global_step=training_util.get_or_create_global_step(),
+          var_list=[z])
+
+    if add_summaries:
+      z_grads = gradients_impl.gradients(loss, z)
+      summary.scalar('z_loss/z_grads', clip_ops.global_norm(z_grads))
+      summary.scalar('z_loss/loss', loss)
+
+    return model_fn_lib.EstimatorSpec(mode=mode,
+                                      predictions=gan_model.generated_data,
+                                      loss=loss,
+                                      train_op=train_op)
+  return model_fn
+
+
+def get_latent_gan_estimator(generator_fn, discriminator_fn, loss_fn,
+                             optimizer, params, config, ckpt_dir,
+                             warmstart_options=True):
+  """Gets an estimator that passes gradients to the input.
+
+  This function takes in a generator and adds a trainable z variable that is
+  used as input to this generator_fn. The generator itself is treated as a black
+  box through which gradients can pass through without updating any weights. The
+  result is a trainable way to traverse the GAN latent space. The loss_fn is
+  used to actually train the z variable. The generator_fn and discriminator_fn
+  should be previously trained by the tfgan library (on reload, the variables
+  are expected to follow the tfgan format. It may be possible to use the
+  latent gan estimator with entirely custom GANs that do not use the tfgan
+  library as long as the appropriate variables are wired properly).
+
+  Args:
+    generator_fn: a function defining a Tensorflow graph for a GAN generator.
+      The weights defined in this graph should already be defined in the given
+      checkpoint location. Should have 'mode' as an argument.
+    discriminator_fn: a function defining a Tensorflow graph for a GAN
+      discriminator. Should have 'mode' as an argument.
+    loss_fn: a function defining a Tensorflow graph for a GAN loss. Takes in a
+      GANModel tuple, features, labels, and add_summaries as inputs.
+    optimizer: a tf.Optimizer or a function that returns a tf.Optimizer with no
+      inputs.
+   params: An object containing the following parameters:
+      - batch_size: an int indicating the size of the training batch.
+      - z_shape: the desired shape of the input z values (not counting batch).
+      - learning_rate: a scalar or function defining a learning rate applied to
+        optimizer.
+      - input_clip: the amount to clip the x training variable by.
+      - add_summaries: whether or not to add summaries.
+      - opt_kwargs: optimizer kwargs.
+    config: tf.RunConfig. Should point model to output dir and should indicate
+     whether to save checkpoints (to avoid saving checkpoints, set
+     save_checkpoints_steps to a number larger than the number of train steps).
+     The model_dir field in the RunConfig should point to a directory WITHOUT
+     any saved checkpoints.
+    ckpt_dir: the directory where the model checkpoints live. The checkpoint is
+     used to warm start the underlying GAN. This should NOT be the same as
+     config.model_dir.
+    warmstart_options: boolean, None, or a WarmStartSettings object. If set to
+      True, uses a default WarmStartSettings object. If set to False or None,
+      does not use warm start. If using a custom WarmStartSettings object, make
+      sure that new variables are properly accounted for when reloading the
+      underlying GAN. Defaults to True.
+  Returns:
+    An estimator spec defining a GAN input training estimator.
+  """
+  model_fn = _get_latent_gan_model_fn(generator_fn, discriminator_fn,
+                                      loss_fn, optimizer)
+
+  if isinstance(warmstart_options, estimator.WarmStartSettings):
+    ws = warmstart_options
+  elif warmstart_options:
+    # Default WarmStart loads all variable names except INPUT_NAME and
+    # OPTIMIZER_NAME.
+    var_regex = '^(?!.*(%s|%s).*)' % (INPUT_NAME, OPTIMIZER_NAME)
+    ws = estimator.WarmStartSettings(ckpt_to_initialize_from=ckpt_dir,
+                                     vars_to_warm_start=var_regex)
+  else:
+    ws = None
+
+  if 'opt_kwargs' not in params:
+    params['opt_kwargs'] = {}
+
+  return estimator.Estimator(model_fn=model_fn, config=config, params=params,
+                             warm_start_from=ws)
diff --git a/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac139e532e35f7aae6da0655103a7249fe3382d4
--- /dev/null
+++ b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_test.py
@@ -0,0 +1,119 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for latent_gan_estimator.
+
+See g3.tp.tensorflow.contrib.gan.python.estimator.python.latent_gan_estimator.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tempfile
+import numpy as np
+from tensorflow.contrib.gan.python.estimator.python import latent_gan_estimator
+from tensorflow.python.estimator import run_config as run_config
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import test
+from tensorflow.python.training import training
+
+
+class TrainInputEstimatorTest(test.TestCase):
+
+  def test_get_input_training_estimator(self):
+    """Integration test to make sure the input_training_estimator works."""
+
+    # Create dummy test input tensors.
+    true_features = np.reshape(np.random.uniform(size=100), (10, 10))
+    true_labels = np.reshape(np.random.uniform(size=100), (5, 20))
+    expected_z_output = [[1, -1], [-1, 1]]
+
+    # Fill out required parameters randomly, includes optimizer kwargs.
+    params = {
+        'batch_size': 2,
+        'z_shape': [2],
+        'learning_rate': 1.0,
+        'input_clip': 1.0,
+        'add_summaries': False,
+        'opt_kwargs': {
+            'beta1': 0.1
+        }
+    }
+
+    input_z_shape = [params['batch_size']] + params['z_shape']
+
+    # Create dummy model functions that represent an underlying GANEstimator and
+    # the input training wrapper. Make sure that everything is wired up
+    # correctly in the internals of each dummy function.
+    def _generator(net, mode):
+      """The generator function will get the newly created z variable."""
+      del mode
+      self.assertSequenceEqual(net.shape, input_z_shape)
+      gen_dummy_var = variable_scope.get_variable(
+          name='generator_dummy_variable',
+          initializer=array_ops.ones(input_z_shape))
+      return net * gen_dummy_var
+
+    def _discriminator(net, condition, mode):
+      """The discriminator function will get either the z variable or labels."""
+      del condition, mode
+      try:
+        self.assertSequenceEqual(net.shape, true_labels.shape)
+      except AssertionError:
+        self.assertSequenceEqual(net.shape, input_z_shape)
+      return net
+
+    def _loss(gan_model, features, labels, _):
+      """Make sure that features and labels are passed in from input."""
+      self.assertTrue(np.array_equal(features, true_features))
+      self.assertTrue(np.array_equal(labels, true_labels))
+      return losses.absolute_difference(expected_z_output,
+                                        gan_model.generated_data)
+
+    optimizer = training.AdamOptimizer
+
+    # We are not loading checkpoints, so set the corresponding directory to a
+    # dummy directories.
+    tmp_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig(model_dir=tmp_dir,
+                                  save_summary_steps=None,
+                                  save_checkpoints_steps=1,
+                                  save_checkpoints_secs=None)
+
+    # Get the estimator. Disable warm start so that there is no attempted
+    # checkpoint reloading.
+    estimator = latent_gan_estimator.get_latent_gan_estimator(
+        _generator, _discriminator, _loss, optimizer, params, config, tmp_dir,
+        warmstart_options=None)
+
+    # Train for a few steps.
+    def dummy_input():
+      return true_features, true_labels
+    estimator.train(input_fn=dummy_input, steps=10)
+
+    # Make sure the generator variables did not change, but the z variables did
+    # change.
+    self.assertTrue(np.array_equal(
+        estimator.get_variable_value('Generator/generator_dummy_variable'),
+        np.ones(input_z_shape)))
+    self.assertTrue(np.array_equal(
+        estimator.get_variable_value('new_var_z_input'),
+        expected_z_output))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_impl.py
index f60e16bc04662b33bc0bb22b5acc8c7fcc7a03ba..2a485e7d47ff10cf34c1b44f8dcc6b1f33c9a05f 100644
--- a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_impl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""A TFGAN-backed StarGAN Estimator."""
+"""A TF-GAN-backed StarGAN Estimator."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py
index 2ec7938c7c4051842c7e982b54c1213b6e841b79..c00ff4399748a77f88d9753df7592bf3859d754e 100644
--- a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for TFGAN's stargan_estimator.py."""
+"""Tests for TF-GAN's stargan_estimator.py."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -80,7 +80,7 @@ class StarGetGANModelTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(input_data, gan_model.input_data)
     self.assertIsNotNone(gan_model.generated_data)
     self.assertIsNotNone(gan_model.generated_data_domain_target)
-    self.assertEqual(1, len(gan_model.generator_variables))
+    self.assertLen(gan_model.generator_variables, 1)
     self.assertIsNotNone(gan_model.generator_scope)
     self.assertIsNotNone(gan_model.generator_fn)
     if mode == model_fn_lib.ModeKeys.PREDICT:
@@ -109,7 +109,7 @@ class StarGetGANModelTest(test.TestCase, parameterized.TestCase):
           gan_model.discriminator_input_data_domain_predication)
       self.assertIsNotNone(
           gan_model.discriminator_generated_data_domain_predication)
-      self.assertEqual(2, len(gan_model.discriminator_variables))  # 1 FC layer
+      self.assertLen(gan_model.discriminator_variables, 2)  # 1 FC layer
       self.assertIsNotNone(gan_model.discriminator_scope)
       self.assertIsNotNone(gan_model.discriminator_fn)
 
@@ -163,6 +163,7 @@ class GetEstimatorSpecTest(test.TestCase, parameterized.TestCase):
 
   @classmethod
   def setUpClass(cls):
+    super(GetEstimatorSpecTest, cls).setUpClass()
     cls._generator_optimizer = training.GradientDescentOptimizer(1.0)
     cls._discriminator_optimizer = training.GradientDescentOptimizer(1.0)
 
diff --git a/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_impl.py
index 295d1382e2cefc6c6e016416dc1f4cddd4f3a46a..8f2a22c78a304c7cc66ef069a235483e9279b3b2 100644
--- a/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_impl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""A TFGAN-backed GAN Estimator that works on TPU."""
+"""A TF-GAN-backed GAN Estimator that works on TPU."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -294,9 +294,10 @@ def _get_estimator_spec(
         gan_model, gan_loss, gan_loss_no_reduction, get_eval_metric_ops_fn)
   else:  # model_fn_lib.ModeKeys.TRAIN:
     gan_loss = tfgan_tuples.GANLoss(
-        generator_loss=generator_loss_fn(gan_model, add_summaries=False),
+        generator_loss=generator_loss_fn(
+            gan_model, add_summaries=not is_on_tpu),
         discriminator_loss=discriminator_loss_fn(
-            gan_model, add_summaries=False))
+            gan_model, add_summaries=not is_on_tpu))
 
     # Construct optimizers if arguments were callable. For TPUs, they must be
     # `CrossShardOptimizer`.
diff --git a/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py
index 0a08b4386f88d0480effae63f511cf8140dd22f8..9fdcc08334d50b4ddf3a0bc9bc755e55d51b0bd8 100644
--- a/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for TFGAN's TPU Estimator."""
+"""Tests for TF-GAN's TPU Estimator."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/eval/__init__.py b/tensorflow/contrib/gan/python/eval/__init__.py
index f86b8513053a45f9830411f7df2c32d1f36a97b2..92e9abf8a35de1999eb800e169f32220fe47f8cd 100644
--- a/tensorflow/contrib/gan/python/eval/__init__.py
+++ b/tensorflow/contrib/gan/python/eval/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN evaluation module.
+"""TF-GAN evaluation module.
 
 This module supports techniques such as Inception Score, Frechet Inception
 distance, and Sliced Wasserstein distance.
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics.py
index 1c872626a957279132772ae27df7a66a2564e9a5..a52e899114b62cb29752f72aa59f142f4a428aa1 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Model evaluation tools for TFGAN."""
+"""Model evaluation tools for TF-GAN."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
index ea55241b34314eaa64993a7a3855b9cb2f5922b9..31f0d34ed68a6adc25cca102236079d0f66615cb 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Model evaluation tools for TFGAN.
+"""Model evaluation tools for TF-GAN.
 
 These methods come from https://arxiv.org/abs/1606.03498,
 https://arxiv.org/abs/1706.08500, and https://arxiv.org/abs/1801.01401.
@@ -795,9 +795,9 @@ def kernel_classifier_distance(real_images,
       on a classifier.
     num_classifier_batches: Number of batches to split images in to in order to
       efficiently run them through the classifier network.
-    max_estimator_block_size: integer, default 1024. The distance estimator
-      splits samples into blocks for computational efficiency. Larger values are
-      more computationally expensive but decrease the variance of the distance
+    max_block_size: integer, default 1024. The distance estimator splits samples
+      into blocks for computational efficiency. Larger values are more
+      computationally expensive but decrease the variance of the distance
       estimate.
     dtype: if not None, coerce activations to this dtype before computations.
 
@@ -872,9 +872,9 @@ def kernel_classifier_distance_and_std(real_images,
       on a classifier.
     num_classifier_batches: Number of batches to split images in to in order to
       efficiently run them through the classifier network.
-    max_estimator_block_size: integer, default 1024. The distance estimator
-      splits samples into blocks for computational efficiency. Larger values are
-      more computationally expensive but decrease the variance of the distance
+    max_block_size: integer, default 1024. The distance estimator splits samples
+      into blocks for computational efficiency. Larger values are more
+      computationally expensive but decrease the variance of the distance
       estimate. Having a smaller block size also gives a better estimate of the
       standard error.
     dtype: if not None, coerce activations to this dtype before computations.
@@ -911,7 +911,7 @@ def kernel_classifier_distance_and_std(real_images,
   gen_a = array_ops.concat(array_ops.unstack(gen_a), 0)
 
   return kernel_classifier_distance_and_std_from_activations(
-      real_a, gen_a, max_block_size=max_block_size)
+      real_a, gen_a, max_block_size, dtype)
 
 
 kernel_inception_distance_and_std = functools.partial(
@@ -968,14 +968,14 @@ def kernel_classifier_distance_from_activations(real_activations,
       into blocks for computational efficiency. Larger values are more
       computationally expensive but decrease the variance of the distance
       estimate.
-    dtype: if not None, coerce activations to this dtype before computations.
+    dtype: If not None, coerce activations to this dtype before computations.
 
   Returns:
    The Kernel Inception Distance. A floating-point scalar of the same type
    as the output of the activations.
   """
   return kernel_classifier_distance_and_std_from_activations(
-      real_activations, generated_activations, max_block_size=max_block_size)[0]
+      real_activations, generated_activations, max_block_size, dtype)[0]
 
 
 def kernel_classifier_distance_and_std_from_activations(real_activations,
@@ -1030,7 +1030,7 @@ def kernel_classifier_distance_and_std_from_activations(real_activations,
       computationally expensive but decrease the variance of the distance
       estimate. Having a smaller block size also gives a better estimate of the
       standard error.
-    dtype: if not None, coerce activations to this dtype before computations.
+    dtype: If not None, coerce activations to this dtype before computations.
 
   Returns:
    The Kernel Inception Distance. A floating-point scalar of the same type
@@ -1081,7 +1081,7 @@ def kernel_classifier_distance_and_std_from_activations(real_activations,
   dim = math_ops.cast(real_activations.shape[1], dtype)
 
   def compute_kid_block(i):
-    'Compute the ith block of the KID estimate.'
+    """Computes the ith block of the KID estimate."""
     r_s = inds_r[i]
     r_e = inds_r[i + 1]
     r = real_activations[r_s:r_e]
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
index dbff1d2a367e10adc607dafb4c571bb3607a3963..bd17571a0535a3c8e9dfee24a8da16eb2e72f165 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for TFGAN classifier_metrics."""
+"""Tests for TF-GAN classifier_metrics."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -234,7 +234,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
     else:
       logits = classifier_metrics.run_inception(img, _get_dummy_graphdef())
 
-    self.assertTrue(isinstance(logits, ops.Tensor))
+    self.assertIsInstance(logits, ops.Tensor)
     logits.shape.assert_is_compatible_with([batch_size, 1001])
 
     # Check that none of the model variables are trainable.
@@ -258,7 +258,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
           img, _get_dummy_graphdef(),
           output_tensor=classifier_metrics.INCEPTION_FINAL_POOL)
 
-    self.assertTrue(isinstance(pool, ops.Tensor))
+    self.assertIsInstance(pool, ops.Tensor)
     pool.shape.assert_is_compatible_with([batch_size, 2048])
 
     # Check that none of the model variables are trainable.
@@ -276,8 +276,8 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
             classifier_metrics.INCEPTION_FINAL_POOL
         ])
 
-    self.assertTrue(isinstance(logits, ops.Tensor))
-    self.assertTrue(isinstance(pool, ops.Tensor))
+    self.assertIsInstance(logits, ops.Tensor)
+    self.assertIsInstance(pool, ops.Tensor)
     logits.shape.assert_is_compatible_with([batch_size, 1001])
     pool.shape.assert_is_compatible_with([batch_size, 2048])
 
@@ -290,7 +290,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
         classifier_metrics.inception_score,
         array_ops.zeros([6, 299, 299, 3]),
         num_batches=3)
-    self.assertTrue(isinstance(score, ops.Tensor))
+    self.assertIsInstance(score, ops.Tensor)
     score.shape.assert_has_rank(0)
 
     # Check that none of the model variables are trainable.
@@ -302,7 +302,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
     distance = _run_with_mock(
         classifier_metrics.frechet_inception_distance, img, img)
 
-    self.assertTrue(isinstance(distance, ops.Tensor))
+    self.assertIsInstance(distance, ops.Tensor)
     distance.shape.assert_has_rank(0)
 
     # Check that none of the model variables are trainable.
@@ -314,7 +314,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
     distance = _run_with_mock(classifier_metrics.kernel_inception_distance, img,
                               img)
 
-    self.assertTrue(isinstance(distance, ops.Tensor))
+    self.assertIsInstance(distance, ops.Tensor)
     distance.shape.assert_has_rank(0)
 
     # Check that none of the model variables are trainable.
diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py
index 523968bed91f1021ae629bf52c405cf5c2d7b917..326fcb3cdbf2eda66207f134cd2926f09a216a99 100644
--- a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py
+++ b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Model evaluation tools for TFGAN."""
+"""Model evaluation tools for TF-GAN."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries.py b/tensorflow/contrib/gan/python/eval/python/summaries.py
index ecfdb39499b1e824e02415c0db1de3157e4f3216..1b202dfc97304ddc7ced42d65366aaf419439392 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Common TFGAN summaries."""
+"""Common TF-GAN summaries."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
index f9995bb19d0d09eaf6fd96d039b0bba1d3a7055c..9f448d3a1602c503093214201bdc96fc9bee85b5 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Common TFGAN summaries."""
+"""Common TF-GAN summaries."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_test.py b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
index 54a6f8d4d9086ad7fc8db31032677628561e48e8..53fc7cb8ede698c2d8590c7fd3016a884cef9be9 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for TFGAN summaries."""
+"""Tests for TF-GAN summaries."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/features/__init__.py b/tensorflow/contrib/gan/python/features/__init__.py
index 4816daf760143af9f1502873b123ffad8e5ec8ce..410c3a02052cd3a07a36a0ba332a80b3c2705d89 100644
--- a/tensorflow/contrib/gan/python/features/__init__.py
+++ b/tensorflow/contrib/gan/python/features/__init__.py
@@ -27,11 +27,13 @@ from __future__ import print_function
 from tensorflow.contrib.gan.python.features.python import clip_weights
 from tensorflow.contrib.gan.python.features.python import conditioning_utils
 from tensorflow.contrib.gan.python.features.python import random_tensor_pool
+from tensorflow.contrib.gan.python.features.python import spectral_normalization
 from tensorflow.contrib.gan.python.features.python import virtual_batchnorm
 
 from tensorflow.contrib.gan.python.features.python.clip_weights import *
 from tensorflow.contrib.gan.python.features.python.conditioning_utils import *
 from tensorflow.contrib.gan.python.features.python.random_tensor_pool import *
+from tensorflow.contrib.gan.python.features.python.spectral_normalization import *
 from tensorflow.contrib.gan.python.features.python.virtual_batchnorm import *
 # pylint: enable=unused-import,wildcard-import
 
@@ -40,5 +42,6 @@ from tensorflow.python.util.all_util import remove_undocumented
 _allowed_symbols = clip_weights.__all__
 _allowed_symbols += conditioning_utils.__all__
 _allowed_symbols += random_tensor_pool.__all__
+_allowed_symbols += spectral_normalization.__all__
 _allowed_symbols += virtual_batchnorm.__all__
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/gan/python/features/python/spectral_normalization.py b/tensorflow/contrib/gan/python/features/python/spectral_normalization.py
new file mode 100644
index 0000000000000000000000000000000000000000..54d3d0a218dec3588844333cd47e1f92489d8df9
--- /dev/null
+++ b/tensorflow/contrib/gan/python/features/python/spectral_normalization.py
@@ -0,0 +1,32 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras-like layers and utilities that implement Spectral Normalization.
+
+Based on "Spectral Normalization for Generative Adversarial Networks" by Miyato,
+et al in ICLR 2018. https://openreview.net/pdf?id=B1QRgziT-
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.gan.python.features.python import spectral_normalization_impl
+# pylint: disable=wildcard-import
+from tensorflow.contrib.gan.python.features.python.spectral_normalization_impl import *
+# pylint: enable=wildcard-import
+from tensorflow.python.util.all_util import remove_undocumented
+
+__all__ = spectral_normalization_impl.__all__
+remove_undocumented(__name__, __all__)
diff --git a/tensorflow/contrib/gan/python/features/python/spectral_normalization_impl.py b/tensorflow/contrib/gan/python/features/python/spectral_normalization_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cc653f0a7907f407e66add5537d1e0a5adb6d8b
--- /dev/null
+++ b/tensorflow/contrib/gan/python/features/python/spectral_normalization_impl.py
@@ -0,0 +1,315 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras-like layers and utilities that implement Spectral Normalization.
+
+Based on "Spectral Normalization for Generative Adversarial Networks" by Miyato,
+et al in ICLR 2018. https://openreview.net/pdf?id=B1QRgziT-
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import numbers
+import re
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.engine import base_layer_utils as keras_base_layer_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import tf_logging as logging
+
+__all__ = [
+    'compute_spectral_norm', 'spectral_normalize', 'spectral_norm_regularizer',
+    'spectral_normalization_custom_getter', 'keras_spectral_normalization'
+]
+
+# tf.bfloat16 should work, but tf.matmul converts those to tf.float32 which then
+# can't directly be assigned back to the tf.bfloat16 variable.
+_OK_DTYPES_FOR_SPECTRAL_NORM = (dtypes.float16, dtypes.float32, dtypes.float64)
+_PERSISTED_U_VARIABLE_SUFFIX = 'spectral_norm_u'
+
+
+def compute_spectral_norm(w_tensor, power_iteration_rounds=1, name=None):
+  """Estimates the largest singular value in the weight tensor.
+
+  Args:
+    w_tensor: The weight matrix whose spectral norm should be computed.
+    power_iteration_rounds: The number of iterations of the power method to
+      perform. A higher number yeilds a better approximation.
+    name: An optional scope name.
+
+  Returns:
+    The largest singular value (the spectral norm) of w.
+  """
+  with variable_scope.variable_scope(name, 'spectral_norm'):
+    # The paper says to flatten convnet kernel weights from
+    # (C_out, C_in, KH, KW) to (C_out, C_in * KH * KW). But TensorFlow's Conv2D
+    # kernel weight shape is (KH, KW, C_in, C_out), so it should be reshaped to
+    # (KH * KW * C_in, C_out), and similarly for other layers that put output
+    # channels as last dimension.
+    # n.b. this means that w here is equivalent to w.T in the paper.
+    w = array_ops.reshape(w_tensor, (-1, w_tensor.get_shape()[-1]))
+
+    # Persisted approximation of first left singular vector of matrix `w`.
+    u_var = variable_scope.get_variable(
+        _PERSISTED_U_VARIABLE_SUFFIX,
+        shape=(w.shape[0], 1),
+        dtype=w.dtype,
+        initializer=init_ops.random_normal_initializer(),
+        trainable=False)
+    u = u_var
+
+    # Use power iteration method to approximate spectral norm.
+    for _ in range(power_iteration_rounds):
+      # `v` approximates the first right singular vector of matrix `w`.
+      v = nn.l2_normalize(math_ops.matmul(array_ops.transpose(w), u))
+      u = nn.l2_normalize(math_ops.matmul(w, v))
+
+    # Update persisted approximation.
+    with ops.control_dependencies([u_var.assign(u, name='update_u')]):
+      u = array_ops.identity(u)
+
+    u = array_ops.stop_gradient(u)
+    v = array_ops.stop_gradient(v)
+
+    # Largest singular value of `w`.
+    spectral_norm = math_ops.matmul(
+        math_ops.matmul(array_ops.transpose(u), w), v)
+    spectral_norm.shape.assert_is_fully_defined()
+    spectral_norm.shape.assert_is_compatible_with([1, 1])
+
+    return spectral_norm[0][0]
+
+
+def spectral_normalize(w, power_iteration_rounds=1, name=None):
+  """Normalizes a weight matrix by its spectral norm.
+
+  Args:
+    w: The weight matrix to be normalized.
+    power_iteration_rounds: The number of iterations of the power method to
+      perform. A higher number yeilds a better approximation.
+    name: An optional scope name.
+
+  Returns:
+    A normalized weight matrix tensor.
+  """
+  with variable_scope.variable_scope(name, 'spectral_normalize'):
+    w_normalized = w / compute_spectral_norm(
+        w, power_iteration_rounds=power_iteration_rounds)
+    return array_ops.reshape(w_normalized, w.get_shape())
+
+
+def spectral_norm_regularizer(scale, power_iteration_rounds=1, scope=None):
+  """Returns a functions that can be used to apply spectral norm regularization.
+
+  Small spectral norms enforce a small Lipschitz constant, which is necessary
+  for Wasserstein GANs.
+
+  Args:
+    scale: A scalar multiplier. 0.0 disables the regularizer.
+    power_iteration_rounds: The number of iterations of the power method to
+      perform. A higher number yeilds a better approximation.
+    scope: An optional scope name.
+
+  Returns:
+    A function with the signature `sn(weights)` that applies spectral norm
+    regularization.
+
+  Raises:
+    ValueError: If scale is negative or if scale is not a float.
+  """
+  if isinstance(scale, numbers.Integral):
+    raise ValueError('scale cannot be an integer: %s' % scale)
+  if isinstance(scale, numbers.Real):
+    if scale < 0.0:
+      raise ValueError(
+          'Setting a scale less than 0 on a regularizer: %g' % scale)
+    if scale == 0.0:
+      logging.info('Scale of 0 disables regularizer.')
+      return lambda _: None
+
+  def sn(weights, name=None):
+    """Applies spectral norm regularization to weights."""
+    with ops.name_scope(scope, 'SpectralNormRegularizer', [weights]) as name:
+      scale_t = ops.convert_to_tensor(
+          scale, dtype=weights.dtype.base_dtype, name='scale')
+      return math_ops.multiply(
+          scale_t,
+          compute_spectral_norm(
+              weights, power_iteration_rounds=power_iteration_rounds),
+          name=name)
+
+  return sn
+
+
+def _default_name_filter(name):
+  """A filter function to identify common names of weight variables.
+
+  Args:
+    name: The variable name.
+
+  Returns:
+    Whether `name` is a standard name for a weight/kernel variables used in the
+    Keras, tf.layers, tf.contrib.layers or tf.contrib.slim libraries.
+  """
+  match = re.match(r'(.*\/)?(depthwise_|pointwise_)?(weights|kernel)$', name)
+  return match is not None
+
+
+def spectral_normalization_custom_getter(name_filter=_default_name_filter,
+                                         power_iteration_rounds=1):
+  """Custom getter that performs Spectral Normalization on a weight tensor.
+
+  Specifically it divides the weight tensor by its largest singular value. This
+  is intended to stabilize GAN training, by making the discriminator satisfy a
+  local 1-Lipschitz constraint.
+
+  Based on [Spectral Normalization for Generative Adversarial Networks][sn-gan].
+
+  [sn-gan]: https://openreview.net/forum?id=B1QRgziT-
+
+  To reproduce an SN-GAN, apply this custom_getter to every weight tensor of
+  your discriminator. The last dimension of the weight tensor must be the number
+  of output channels.
+
+  Apply this to layers by supplying this as the `custom_getter` of a
+  `tf.variable_scope`. For example:
+
+    with tf.variable_scope('discriminator',
+                           custom_getter=spectral_norm_getter()):
+      net = discriminator_fn(net)
+
+  IMPORTANT: Keras does not respect the custom_getter supplied by the
+  VariableScope, so Keras users should use `keras_spectral_normalization`
+  instead of (or in addition to) this approach.
+
+  It is important to carefully select to which weights you want to apply
+  Spectral Normalization. In general you want to normalize the kernels of
+  convolution and dense layers, but you do not want to normalize biases. You
+  also want to avoid normalizing batch normalization (and similar) variables,
+  but in general such layers play poorly with Spectral Normalization, since the
+  gamma can cancel out the normalization in other layers. By default we supply a
+  filter that matches the kernel variable names of the dense and convolution
+  layers of the tf.layers, tf.contrib.layers, tf.keras and tf.contrib.slim
+  libraries. If you are using anything else you'll need a custom `name_filter`.
+
+  This custom getter internally creates a variable used to compute the spectral
+  norm by power iteration. It will update every time the variable is accessed,
+  which means the normalized discriminator weights may change slightly whilst
+  training the generator. Whilst unusual, this matches how the paper's authors
+  implement it, and in general additional rounds of power iteration can't hurt.
+
+  Args:
+    name_filter: Optionally, a method that takes a Variable name as input and
+      returns whether this Variable should be normalized.
+    power_iteration_rounds: The number of iterations of the power method to
+      perform per step. A higher number yeilds a better approximation of the
+      true spectral norm.
+
+  Returns:
+    A custom getter function that applies Spectral Normalization to all
+    Variables whose names match `name_filter`.
+
+  Raises:
+    ValueError: If name_filter is not callable.
+  """
+  if not callable(name_filter):
+    raise ValueError('name_filter must be callable')
+
+  def _internal_getter(getter, name, *args, **kwargs):
+    """A custom getter function that applies Spectral Normalization.
+
+    Args:
+      getter: The true getter to call.
+      name: Name of new/existing variable, in the same format as
+        tf.get_variable.
+      *args: Other positional arguments, in the same format as tf.get_variable.
+      **kwargs: Keyword arguments, in the same format as tf.get_variable.
+
+    Returns:
+      The return value of `getter(name, *args, **kwargs)`, spectrally
+      normalized.
+
+    Raises:
+      ValueError: If used incorrectly, or if `dtype` is not supported.
+    """
+    if not name_filter(name):
+      return getter(name, *args, **kwargs)
+
+    if name.endswith(_PERSISTED_U_VARIABLE_SUFFIX):
+      raise ValueError(
+          'Cannot apply Spectral Normalization to internal variables created '
+          'for Spectral Normalization. Tried to normalized variable [%s]' %
+          name)
+
+    if kwargs['dtype'] not in _OK_DTYPES_FOR_SPECTRAL_NORM:
+      raise ValueError('Disallowed data type {}'.format(kwargs['dtype']))
+
+    # This layer's weight Variable/PartitionedVariable.
+    w_tensor = getter(name, *args, **kwargs)
+
+    if len(w_tensor.get_shape()) < 2:
+      raise ValueError(
+          'Spectral norm can only be applied to multi-dimensional tensors')
+
+    return spectral_normalize(
+        w_tensor,
+        power_iteration_rounds=power_iteration_rounds,
+        name=(name + '/spectral_normalize'))
+
+  return _internal_getter
+
+
+@contextlib.contextmanager
+def keras_spectral_normalization(name_filter=_default_name_filter,
+                                 power_iteration_rounds=1):
+  """A context manager that enables Spectral Normalization for Keras.
+
+  Keras doesn't respect the `custom_getter` in the VariableScope, so this is a
+  bit of a hack to make things work.
+
+  Usage:
+    with keras_spectral_normalization():
+      net = discriminator_fn(net)
+
+  Args:
+    name_filter: Optionally, a method that takes a Variable name as input and
+      returns whether this Variable should be normalized.
+    power_iteration_rounds: The number of iterations of the power method to
+      perform per step. A higher number yeilds a better approximation of the
+      true spectral norm.
+
+  Yields:
+    A context manager that wraps the standard Keras variable creation method
+    with the `spectral_normalization_custom_getter`.
+  """
+  original_make_variable = keras_base_layer_utils.make_variable
+  sn_getter = spectral_normalization_custom_getter(
+      name_filter=name_filter, power_iteration_rounds=power_iteration_rounds)
+
+  def make_variable_wrapper(name, *args, **kwargs):
+    return sn_getter(original_make_variable, name, *args, **kwargs)
+
+  keras_base_layer_utils.make_variable = make_variable_wrapper
+
+  yield
+
+  keras_base_layer_utils.make_variable = original_make_variable
diff --git a/tensorflow/contrib/gan/python/features/python/spectral_normalization_test.py b/tensorflow/contrib/gan/python/features/python/spectral_normalization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ea21f70ec01950cfef5e4fa851c78b219d6062f
--- /dev/null
+++ b/tensorflow/contrib/gan/python/features/python/spectral_normalization_test.py
@@ -0,0 +1,354 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for features.spectral_normalization."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib import slim
+from tensorflow.contrib.gan.python.features.python import spectral_normalization_impl as spectral_normalization
+from tensorflow.contrib.layers.python.layers import layers as contrib_layers
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.layers import convolutional as keras_convolutional
+from tensorflow.python.keras.layers import core as keras_core
+from tensorflow.python.layers import convolutional as layers_convolutional
+from tensorflow.python.layers import core as layers_core
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class SpectralNormalizationTest(test.TestCase):
+
+  def testComputeSpectralNorm(self):
+    weights = variable_scope.get_variable(
+        'w', dtype=dtypes.float32, shape=[2, 3, 50, 100])
+    weights = math_ops.multiply(weights, 10.0)
+    s = linalg_ops.svd(
+        array_ops.reshape(weights, [-1, weights.shape[-1]]), compute_uv=False)
+    true_sn = s[..., 0]
+    estimated_sn = spectral_normalization.compute_spectral_norm(weights)
+
+    with self.cached_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      np_true_sn = sess.run(true_sn)
+      for i in range(50):
+        est = sess.run(estimated_sn)
+        if i < 1:
+          np_est_1 = est
+        if i < 4:
+          np_est_5 = est
+        if i < 9:
+          np_est_10 = est
+        np_est_50 = est
+
+      # Check that the estimate improves with more iterations.
+      self.assertAlmostEqual(np_true_sn, np_est_50, 0)
+      self.assertGreater(
+          abs(np_true_sn - np_est_10), abs(np_true_sn - np_est_50))
+      self.assertGreater(
+          abs(np_true_sn - np_est_5), abs(np_true_sn - np_est_10))
+      self.assertGreater(abs(np_true_sn - np_est_1), abs(np_true_sn - np_est_5))
+
+  def testSpectralNormalize(self):
+    weights = variable_scope.get_variable(
+        'w', dtype=dtypes.float32, shape=[2, 3, 50, 100])
+    weights = math_ops.multiply(weights, 10.0)
+    normalized_weights = spectral_normalization.spectral_normalize(
+        weights, power_iteration_rounds=1)
+
+    unnormalized_sigma = linalg_ops.svd(
+        array_ops.reshape(weights, [-1, weights.shape[-1]]),
+        compute_uv=False)[..., 0]
+    normalized_sigma = linalg_ops.svd(
+        array_ops.reshape(normalized_weights, [-1, weights.shape[-1]]),
+        compute_uv=False)[..., 0]
+
+    with self.cached_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      s0 = sess.run(unnormalized_sigma)
+
+      for i in range(50):
+        sigma = sess.run(normalized_sigma)
+        if i < 1:
+          s1 = sigma
+        if i < 5:
+          s5 = sigma
+        if i < 10:
+          s10 = sigma
+        s50 = sigma
+
+      self.assertAlmostEqual(1., s50, 0)
+      self.assertGreater(abs(s10 - 1.), abs(s50 - 1.))
+      self.assertGreater(abs(s5 - 1.), abs(s10 - 1.))
+      self.assertGreater(abs(s1 - 1.), abs(s5 - 1.))
+      self.assertGreater(abs(s0 - 1.), abs(s1 - 1.))
+
+  def _testLayerHelper(self, build_layer_fn, w_shape, b_shape, is_keras=False):
+    x = array_ops.placeholder(dtypes.float32, shape=[2, 10, 10, 3])
+
+    w_initial = np.random.randn(*w_shape) * 10
+    w_initializer = init_ops.constant_initializer(w_initial)
+    b_initial = np.random.randn(*b_shape)
+    b_initializer = init_ops.constant_initializer(b_initial)
+
+    if is_keras:
+      context_manager = spectral_normalization.keras_spectral_normalization()
+    else:
+      getter = spectral_normalization.spectral_normalization_custom_getter()
+      context_manager = variable_scope.variable_scope('', custom_getter=getter)
+
+    with context_manager:
+      (net,
+       expected_normalized_vars, expected_not_normalized_vars) = build_layer_fn(
+           x, w_initializer, b_initializer)
+
+    x_data = np.random.rand(*x.shape)
+
+    with self.cached_session() as sess:
+      sess.run(variables.global_variables_initializer())
+
+      # Before running a forward pass we still expect the variables values to
+      # differ from the initial value because of the normalizer.
+      w_befores = []
+      for name, var in expected_normalized_vars.items():
+        w_before = sess.run(var)
+        w_befores.append(w_before)
+        self.assertFalse(
+            np.allclose(w_initial, w_before),
+            msg=('%s appears not to be normalized. Before: %s After: %s' %
+                 (name, w_initial, w_before)))
+
+      # Not true for the unnormalized variables.
+      for name, var in expected_not_normalized_vars.items():
+        b_before = sess.run(var)
+        self.assertTrue(
+            np.allclose(b_initial, b_before),
+            msg=('%s appears to be unexpectedly normalized. '
+                 'Before: %s After: %s' % (name, b_initial, b_before)))
+
+      # Run a bunch of forward passes.
+      for _ in range(1000):
+        _ = sess.run(net, feed_dict={x: x_data})
+
+      # We expect this to have improved the estimate of the spectral norm,
+      # which should have changed the variable values and brought them close
+      # to the true Spectral Normalized values.
+      _, s, _ = np.linalg.svd(w_initial.reshape([-1, 3]))
+      exactly_normalized = w_initial / s[0]
+      for w_before, (name, var) in zip(w_befores,
+                                       expected_normalized_vars.items()):
+        w_after = sess.run(var)
+        self.assertFalse(
+            np.allclose(w_before, w_after, rtol=1e-8, atol=1e-8),
+            msg=('%s did not improve over many iterations. '
+                 'Before: %s After: %s' % (name, w_before, w_after)))
+        self.assertAllClose(
+            exactly_normalized,
+            w_after,
+            rtol=1e-4,
+            atol=1e-4,
+            msg=('Estimate of spectral norm for %s was innacurate. '
+                 'Normalized matrices do not match.'
+                 'Estimate: %s Actual: %s' % (name, w_after,
+                                              exactly_normalized)))
+
+  def testConv2D_Layers(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      layer = layers_convolutional.Conv2D(
+          filters=3,
+          kernel_size=3,
+          padding='same',
+          kernel_initializer=w_initializer,
+          bias_initializer=b_initializer)
+      net = layer.apply(x)
+      expected_normalized_vars = {'tf.layers.Conv2d.kernel': layer.kernel}
+      expected_not_normalized_vars = {'tf.layers.Conv2d.bias': layer.bias}
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (3, 3, 3, 3), (3,))
+
+  def testConv2D_ContribLayers(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      var_collection = {
+          'weights': ['CONTRIB_LAYERS_CONV2D_WEIGHTS'],
+          'biases': ['CONTRIB_LAYERS_CONV2D_BIASES']
+      }
+      net = contrib_layers.conv2d(
+          x,
+          3,
+          3,
+          weights_initializer=w_initializer,
+          biases_initializer=b_initializer,
+          variables_collections=var_collection)
+      weight_vars = ops.get_collection('CONTRIB_LAYERS_CONV2D_WEIGHTS')
+      self.assertEquals(1, len(weight_vars))
+      bias_vars = ops.get_collection('CONTRIB_LAYERS_CONV2D_BIASES')
+      self.assertEquals(1, len(bias_vars))
+      expected_normalized_vars = {
+          'contrib.layers.conv2d.weights': weight_vars[0]
+      }
+      expected_not_normalized_vars = {
+          'contrib.layers.conv2d.bias': bias_vars[0]
+      }
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (3, 3, 3, 3), (3,))
+
+  def testConv2D_Slim(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      var_collection = {
+          'weights': ['SLIM_CONV2D_WEIGHTS'],
+          'biases': ['SLIM_CONV2D_BIASES']
+      }
+      net = slim.conv2d(
+          x,
+          3,
+          3,
+          weights_initializer=w_initializer,
+          biases_initializer=b_initializer,
+          variables_collections=var_collection)
+      weight_vars = ops.get_collection('SLIM_CONV2D_WEIGHTS')
+      self.assertEquals(1, len(weight_vars))
+      bias_vars = ops.get_collection('SLIM_CONV2D_BIASES')
+      self.assertEquals(1, len(bias_vars))
+      expected_normalized_vars = {'slim.conv2d.weights': weight_vars[0]}
+      expected_not_normalized_vars = {'slim.conv2d.bias': bias_vars[0]}
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (3, 3, 3, 3), (3,))
+
+  def testConv2D_Keras(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      layer = keras_convolutional.Conv2D(
+          filters=3,
+          kernel_size=3,
+          padding='same',
+          kernel_initializer=w_initializer,
+          bias_initializer=b_initializer)
+      net = layer.apply(x)
+      expected_normalized_vars = {'keras.layers.Conv2d.kernel': layer.kernel}
+      expected_not_normalized_vars = {'keras.layers.Conv2d.bias': layer.bias}
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (3, 3, 3, 3), (3,), is_keras=True)
+
+  def testFC_Layers(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      x = layers_core.Flatten()(x)
+      layer = layers_core.Dense(
+          units=3,
+          kernel_initializer=w_initializer,
+          bias_initializer=b_initializer)
+      net = layer.apply(x)
+      expected_normalized_vars = {'tf.layers.Dense.kernel': layer.kernel}
+      expected_not_normalized_vars = {'tf.layers.Dense.bias': layer.bias}
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (300, 3), (3,))
+
+  def testFC_ContribLayers(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      var_collection = {
+          'weights': ['CONTRIB_LAYERS_FC_WEIGHTS'],
+          'biases': ['CONTRIB_LAYERS_FC_BIASES']
+      }
+      x = contrib_layers.flatten(x)
+      net = contrib_layers.fully_connected(
+          x,
+          3,
+          weights_initializer=w_initializer,
+          biases_initializer=b_initializer,
+          variables_collections=var_collection)
+      weight_vars = ops.get_collection('CONTRIB_LAYERS_FC_WEIGHTS')
+      self.assertEquals(1, len(weight_vars))
+      bias_vars = ops.get_collection('CONTRIB_LAYERS_FC_BIASES')
+      self.assertEquals(1, len(bias_vars))
+      expected_normalized_vars = {
+          'contrib.layers.fully_connected.weights': weight_vars[0]
+      }
+      expected_not_normalized_vars = {
+          'contrib.layers.fully_connected.bias': bias_vars[0]
+      }
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (300, 3), (3,))
+
+  def testFC_Slim(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      var_collection = {
+          'weights': ['SLIM_FC_WEIGHTS'],
+          'biases': ['SLIM_FC_BIASES']
+      }
+      x = slim.flatten(x)
+      net = slim.fully_connected(
+          x,
+          3,
+          weights_initializer=w_initializer,
+          biases_initializer=b_initializer,
+          variables_collections=var_collection)
+      weight_vars = ops.get_collection('SLIM_FC_WEIGHTS')
+      self.assertEquals(1, len(weight_vars))
+      bias_vars = ops.get_collection('SLIM_FC_BIASES')
+      self.assertEquals(1, len(bias_vars))
+      expected_normalized_vars = {
+          'slim.fully_connected.weights': weight_vars[0]
+      }
+      expected_not_normalized_vars = {'slim.fully_connected.bias': bias_vars[0]}
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (300, 3), (3,))
+
+  def testFC_Keras(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      x = keras_core.Flatten()(x)
+      layer = keras_core.Dense(
+          units=3,
+          kernel_initializer=w_initializer,
+          bias_initializer=b_initializer)
+      net = layer.apply(x)
+      expected_normalized_vars = {'keras.layers.Dense.kernel': layer.kernel}
+      expected_not_normalized_vars = {'keras.layers.Dense.bias': layer.bias}
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (300, 3), (3,), is_keras=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl.py b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
index a0a86c6337eefa756a209635faa70db686a36247..1f1ae2df4d6def618e86aced3296ac89c836eab7 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
@@ -28,7 +28,7 @@ wasserstein_gradient_penalty
 All losses must be able to accept 1D or 2D Tensors, so as to be compatible with
 patchGAN style losses (https://arxiv.org/abs/1611.07004).
 
-To make these losses usable in the TFGAN framework, please create a tuple
+To make these losses usable in the TF-GAN framework, please create a tuple
 version of the losses with `losses_utils.py`.
 """
 
@@ -38,6 +38,7 @@ from __future__ import print_function
 
 
 from tensorflow.contrib.framework.python.ops import variables as contrib_variables_lib
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -69,6 +70,10 @@ __all__ = [
 ]
 
 
+def _to_float(tensor):
+  return math_ops.cast(tensor, dtypes.float32)
+
+
 # Wasserstein losses from `Wasserstein GAN` (https://arxiv.org/abs/1701.07875).
 def wasserstein_generator_loss(
     discriminator_gen_outputs,
@@ -98,7 +103,7 @@ def wasserstein_generator_loss(
   """
   with ops.name_scope(scope, 'generator_wasserstein_loss', (
       discriminator_gen_outputs, weights)) as scope:
-    discriminator_gen_outputs = math_ops.to_float(discriminator_gen_outputs)
+    discriminator_gen_outputs = _to_float(discriminator_gen_outputs)
 
     loss = - discriminator_gen_outputs
     loss = losses.compute_weighted_loss(
@@ -144,8 +149,8 @@ def wasserstein_discriminator_loss(
   with ops.name_scope(scope, 'discriminator_wasserstein_loss', (
       discriminator_real_outputs, discriminator_gen_outputs, real_weights,
       generated_weights)) as scope:
-    discriminator_real_outputs = math_ops.to_float(discriminator_real_outputs)
-    discriminator_gen_outputs = math_ops.to_float(discriminator_gen_outputs)
+    discriminator_real_outputs = _to_float(discriminator_real_outputs)
+    discriminator_gen_outputs = _to_float(discriminator_gen_outputs)
     discriminator_real_outputs.shape.assert_is_compatible_with(
         discriminator_gen_outputs.shape)
 
@@ -320,7 +325,7 @@ def wasserstein_gradient_penalty(
     generated_data: Output of the generator.
     generator_inputs: Exact argument to pass to the generator, which is used
       as optional conditioning to the discriminator.
-    discriminator_fn: A discriminator function that conforms to TFGAN API.
+    discriminator_fn: A discriminator function that conforms to TF-GAN API.
     discriminator_scope: If not `None`, reuse discriminators from this scope.
     epsilon: A small positive number added for numerical stability when
       computing the gradient norm.
@@ -647,7 +652,7 @@ def least_squares_generator_loss(
   """
   with ops.name_scope(scope, 'lsq_generator_loss',
                       (discriminator_gen_outputs, real_label)) as scope:
-    discriminator_gen_outputs = math_ops.to_float(discriminator_gen_outputs)
+    discriminator_gen_outputs = _to_float(discriminator_gen_outputs)
     loss = math_ops.squared_difference(
         discriminator_gen_outputs, real_label) / 2.0
     loss = losses.compute_weighted_loss(
@@ -702,8 +707,8 @@ def least_squares_discriminator_loss(
   """
   with ops.name_scope(scope, 'lsq_discriminator_loss',
                       (discriminator_gen_outputs, real_label)) as scope:
-    discriminator_real_outputs = math_ops.to_float(discriminator_real_outputs)
-    discriminator_gen_outputs = math_ops.to_float(discriminator_gen_outputs)
+    discriminator_real_outputs = _to_float(discriminator_real_outputs)
+    discriminator_gen_outputs = _to_float(discriminator_gen_outputs)
     discriminator_real_outputs.shape.assert_is_compatible_with(
         discriminator_gen_outputs.shape)
 
diff --git a/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py b/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
index 221c70c38bd432a6be7f6cda9c6700aa2255821f..76e57df7f646547037b3461ac44f7ee5b971406c 100644
--- a/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
+++ b/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN utilities for loss functions that accept GANModel namedtuples.
+"""TF-GAN utilities for loss functions that accept GANModel namedtuples.
 
 The losses and penalties in this file all correspond to losses in
 `losses_impl.py`. Losses in that file take individual arguments, whereas in this
diff --git a/tensorflow/contrib/gan/python/namedtuples.py b/tensorflow/contrib/gan/python/namedtuples.py
index 969b68449d9c82f9f9144a8657cd8932b38fd0f7..73dfee4fdeec87cf0bac5eb675fd02a64a9ad7f5 100644
--- a/tensorflow/contrib/gan/python/namedtuples.py
+++ b/tensorflow/contrib/gan/python/namedtuples.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Named tuples for TFGAN.
+"""Named tuples for TF-GAN.
 
-TFGAN training occurs in four steps, and each step communicates with the next
-step via one of these named tuples. At each step, you can either use a TFGAN
+TF-GAN training occurs in four steps, and each step communicates with the next
+step via one of these named tuples. At each step, you can either use a TF-GAN
 helper function in `train.py`, or you can manually construct a tuple.
 """
 
diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py
index 4c7bee41b33ce1fee46d374ca5fd1c0b603762f9..f36a5d346e0f27fbbc480e876380db51ed559c09 100644
--- a/tensorflow/contrib/gan/python/train.py
+++ b/tensorflow/contrib/gan/python/train.py
@@ -12,12 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""The TFGAN project provides a lightweight GAN training/testing framework.
+"""The TF-GAN project provides a lightweight GAN training/testing framework.
 
 This file contains the core helper functions to create and train a GAN model.
 See the README or examples in `tensorflow_models` for details on how to use.
 
-TFGAN training occurs in four steps:
+TF-GAN training occurs in four steps:
 1) Create a model
 2) Add a loss
 3) Create train ops
@@ -645,9 +645,10 @@ def gan_loss(
         type(model))
 
   # Optionally create pooled model.
-  pooled_model = (
-      _tensor_pool_adjusted_model(model, tensor_pool_fn)
-      if tensor_pool_fn else model)
+  if tensor_pool_fn:
+    pooled_model = _tensor_pool_adjusted_model(model, tensor_pool_fn)
+  else:
+    pooled_model = model
 
   # Create standard losses.
   gen_loss = generator_loss_fn(model, add_summaries=add_summaries)
@@ -665,10 +666,11 @@ def gan_loss(
   if _use_aux_loss(mutual_information_penalty_weight):
     gen_info_loss = tfgan_losses.mutual_information_penalty(
         model, add_summaries=add_summaries)
-    dis_info_loss = (
-        gen_info_loss
-        if tensor_pool_fn is None else tfgan_losses.mutual_information_penalty(
-            pooled_model, add_summaries=add_summaries))
+    if tensor_pool_fn is None:
+      dis_info_loss = gen_info_loss
+    else:
+      dis_info_loss = tfgan_losses.mutual_information_penalty(
+          pooled_model, add_summaries=add_summaries)
     gen_loss += mutual_information_penalty_weight * gen_info_loss
     dis_loss += mutual_information_penalty_weight * dis_info_loss
   if _use_aux_loss(aux_cond_generator_weight):
@@ -929,7 +931,7 @@ def gan_train_ops(
     **kwargs):
   """Returns GAN train ops.
 
-  The highest-level call in TFGAN. It is composed of functions that can also
+  The highest-level call in TF-GAN. It is composed of functions that can also
   be called, should a user require more control over some part of the GAN
   training process.
 
diff --git a/tensorflow/contrib/kernel_methods/python/losses.py b/tensorflow/contrib/kernel_methods/python/losses.py
index 4ef0a66a52429233c6e6f70667a451466493629c..294a7d69a704b3c06ab9e30489af116929ab6c2a 100644
--- a/tensorflow/contrib/kernel_methods/python/losses.py
+++ b/tensorflow/contrib/kernel_methods/python/losses.py
@@ -34,7 +34,7 @@ def sparse_multiclass_hinge_loss(
     scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
     reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS):
-  """Adds Ops for computing the multiclass hinge loss.
+  r"""Adds Ops for computing the multiclass hinge loss.
 
   The implementation is based on the following paper:
   On the Algorithmic Implementation of Multiclass Kernel-based Vector Machines
diff --git a/tensorflow/contrib/kfac/README.md b/tensorflow/contrib/kfac/README.md
index 42b91d031375b8edb7e4f364ac91ffb74ef1f54b..19daffea6c7e4486499388314d0aaaa611e94218 100644
--- a/tensorflow/contrib/kfac/README.md
+++ b/tensorflow/contrib/kfac/README.md
@@ -1,3 +1,3 @@
 # K-FAC: Kronecker-Factored Approximate Curvature
 
-## KFAC moved to third_party/tensorflow_kfac.
+## KFAC moved to https://github.com/tensorflow/kfac.
diff --git a/tensorflow/contrib/layers/python/layers/normalization.py b/tensorflow/contrib/layers/python/layers/normalization.py
index 11033a2e9cb646c2e7cd2f45de1f751d88c6921a..76b03ff514821d3459f84c5f46a64d1134e0d4de 100644
--- a/tensorflow/contrib/layers/python/layers/normalization.py
+++ b/tensorflow/contrib/layers/python/layers/normalization.py
@@ -186,7 +186,7 @@ def group_norm(inputs,
 
   Args:
     inputs: A Tensor with at least 2 dimensions one which is channels. All
-     shape dimensions must be fully defined.
+     shape dimensions except for batch must be fully defined.
     groups: Integer. Divide the channels into this number of groups over which
       normalization statistics are computed. This number must be commensurate
       with the number of channels in `inputs`.
@@ -249,13 +249,21 @@ def group_norm(inputs,
   """
   # TODO(shlens): Support partially defined shapes for the inputs.
   inputs = ops.convert_to_tensor(inputs)
-  original_shape = inputs.shape
 
   if inputs.shape.ndims is None:
     raise ValueError('Inputs %s has undefined rank.' % inputs.name)
   if channels_axis > (inputs.shape.ndims - 1):
     raise ValueError('Axis is out of bounds.')
 
+  # Use dynamic shape for not fully defined dimensions in the inputs.
+  dyanmic_shape = array_ops.shape(inputs)
+  input_shape_list = []
+  for i, dim in enumerate(inputs.shape):
+    if dim.value is None:
+      input_shape_list.append(dyanmic_shape[i])
+    else:
+      input_shape_list.append(dim)
+
   # Standardize the channels_axis to be positive and identify # of channels.
   if channels_axis < 0:
     channels_axis = inputs.shape.ndims + channels_axis
@@ -289,8 +297,8 @@ def group_norm(inputs,
   # Determine axes before channels. Some examples of common image formats:
   #  'NCHW': before = [N], after = [HW]
   #  'NHWC': before = [NHW], after = []
-  axes_before_channels = inputs.shape.as_list()[:channels_axis]
-  axes_after_channels = inputs.shape.as_list()[channels_axis+1:]
+  axes_before_channels = input_shape_list[:channels_axis]
+  axes_after_channels = input_shape_list[channels_axis+1:]
 
   # Manually broadcast the parameters to conform to the number of groups.
   params_shape_broadcast = ([1] * len(axes_before_channels) +
@@ -369,7 +377,7 @@ def group_norm(inputs,
     outputs = inputs * gain + offset
 
     # Collapse the groups into the channel dimension.
-    outputs = array_ops.reshape(outputs, original_shape)
+    outputs = array_ops.reshape(outputs, input_shape_list)
 
     if activation_fn is not None:
       outputs = activation_fn(outputs)
diff --git a/tensorflow/contrib/layers/python/layers/normalization_test.py b/tensorflow/contrib/layers/python/layers/normalization_test.py
index c8d3c91b10dbe3b959e91182f9924b78352d370d..9a85084b239837ade87d8c778393ef8e885f5bdd 100644
--- a/tensorflow/contrib/layers/python/layers/normalization_test.py
+++ b/tensorflow/contrib/layers/python/layers/normalization_test.py
@@ -221,6 +221,15 @@ class GroupNormTest(test.TestCase):
       normalization.group_norm(inputs, channels_axis=-1,
                                reduction_axes=[-3, -2])
 
+  def testParamsShapeNotFullyDefinedBatchAxis(self):
+    height, width, groups = 3, 3, 4
+    inputs = array_ops.placeholder(dtypes.float32,
+                                   shape=(None, height, width, 2*groups))
+    output = normalization.group_norm(inputs, channels_axis=-1,
+                                      reduction_axes=[-3, -2], groups=groups)
+    self.assertListEqual([None, height, width, 2 * groups],
+                         output.shape.as_list())
+
   def testCreateOp(self):
     height, width, groups = 3, 3, 4
     images = random_ops.random_uniform((5, height, width, 2*groups), seed=1)
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py b/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py
index 5e90d1fa20535de3b5e25bc7ff8c3862cea5514c..318046733bf75a6d661d26f478118c8e944afe15 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py
@@ -174,7 +174,7 @@ class GeneratorIoTest(test.TestCase):
       return np.arange(32, 36)
 
     with self.cached_session():
-      with self.assertRaisesRegexp(TypeError, 'x\(\) must be generator'):
+      with self.assertRaisesRegexp(TypeError, r'x\(\) must be generator'):
         failing_input_fn = generator_io.generator_input_fn(
             generator, batch_size=2, shuffle=False, num_epochs=1)
         failing_input_fn()
@@ -185,7 +185,7 @@ class GeneratorIoTest(test.TestCase):
       yield np.arange(32, 36)
 
     with self.cached_session():
-      with self.assertRaisesRegexp(TypeError, 'x\(\) must yield dict'):
+      with self.assertRaisesRegexp(TypeError, r'x\(\) must yield dict'):
         failing_input_fn = generator_io.generator_input_fn(
             generator, batch_size=2, shuffle=False, num_epochs=1)
         failing_input_fn()
diff --git a/tensorflow/contrib/mpi_collectives/BUILD b/tensorflow/contrib/mpi_collectives/BUILD
index ecac06354d2ce796f2a6021cdf2370d7c30ccab7..a7be92a35e0d62a61f7923ac61bb2c1267d039c6 100644
--- a/tensorflow/contrib/mpi_collectives/BUILD
+++ b/tensorflow/contrib/mpi_collectives/BUILD
@@ -52,7 +52,6 @@ tf_custom_op_library(
     deps = [
         ":mpi_defines",
         ":mpi_message_proto_cc",
-        "//tensorflow/stream_executor:stream_executor_headers_lib",
         "//third_party/mpi",
     ],
 )
diff --git a/tensorflow/contrib/opt/python/training/adam_gs_optimizer.py b/tensorflow/contrib/opt/python/training/adam_gs_optimizer.py
index 3fb649ea82e79b3bc78a2da6d5c3e9a071adec6d..c5c9fc74deaf0171a33d0eb1b5c6f60b3aa5e533 100644
--- a/tensorflow/contrib/opt/python/training/adam_gs_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/adam_gs_optimizer.py
@@ -41,7 +41,7 @@ class AdamGSOptimizer(optimizer.Optimizer):
   def __init__(self, global_step=0, learning_rate=0.001,
                beta1=0.9, beta2=0.999, epsilon=1e-8,
                use_locking=False, name="Adam"):
-    """Construct a new Adam optimizer.
+    r"""Construct a new Adam optimizer.
 
     Branched from tf.train.AdamOptimizer. The only difference is to pass
     global step for computing beta1 and beta2 accumulators, instead of having
@@ -83,23 +83,20 @@ class AdamGSOptimizer(optimizer.Optimizer):
     Args:
       global_step: tensorflow variable indicating the step.
       learning_rate: A Tensor or a floating point value.  The learning rate.
-      beta1: A float value or a constant float tensor.
-        The exponential decay rate for the 1st moment estimates.
-      beta2: A float value or a constant float tensor.
-        The exponential decay rate for the 2nd moment estimates.
+      beta1: A float value or a constant float tensor. The exponential decay
+        rate for the 1st moment estimates.
+      beta2: A float value or a constant float tensor. The exponential decay
+        rate for the 2nd moment estimates.
       epsilon: A small constant for numerical stability. This epsilon is
         "epsilon hat" in the Kingma and Ba paper (in the formula just before
         Section 2.1), not the epsilon in Algorithm 1 of the paper.
       use_locking: If True use locks for update operations.
       name: Optional name for the operations created when applying gradients.
-        Defaults to "Adam".
-
-    @compatibility(eager)
-    When eager execution is enabled, `learning_rate`, `beta1`, `beta2`, and
-    `epsilon` can each be a callable that takes no arguments and returns the
-    actual value to use. This can be useful for changing these values across
-    different invocations of optimizer functions.
-    @end_compatibility
+        Defaults to "Adam".  @compatibility(eager) When eager execution is
+        enabled, `learning_rate`, `beta1`, `beta2`, and `epsilon` can each be a
+        callable that takes no arguments and returns the actual value to use.
+        This can be useful for changing these values across different
+        invocations of optimizer functions. @end_compatibility
     """
     super(AdamGSOptimizer, self).__init__(use_locking, name)
     self._lr = learning_rate
diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
index 248ffb1f7eb5dc27112ddf9b8670344904065ed0..1b7800f324b908e3c88fe90d31a2a08cbbd5ccf2 100644
--- a/tensorflow/contrib/optimizer_v2/adam.py
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -36,7 +36,7 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
 
   def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
                use_locking=False, name="Adam"):
-    """Construct a new Adam optimizer.
+    r"""Construct a new Adam optimizer.
 
     Initialization:
 
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index 7fb23abc38d9dc101204ed83808aebe5a8ef1e78..1323ed014c9e51e273491694fa44a8e36cc723d0 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -843,8 +843,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
       scale_loss_by_num_replicas = (
           distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN)
     if scale_loss_by_num_replicas:
-      num_replicas = \
-        distribute_ctx.get_distribution_strategy().num_replicas_in_sync
+      num_replicas = distribute_ctx.get_strategy().num_replicas_in_sync
       if num_replicas > 1:
         loss_value *= 1. / num_replicas
     return loss_value
diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD
index 44b232e0f2b26f16f0300e11cf2764e1157a0050..39b688596875ab1b208d97a5d6f9a5ee811674cb 100644
--- a/tensorflow/contrib/rnn/BUILD
+++ b/tensorflow/contrib/rnn/BUILD
@@ -227,7 +227,10 @@ tf_custom_op_library(
         "kernels/lstm_ops_gpu.cu.cc",
         "kernels/lstm_ops.h",
     ],
-    deps = ["//tensorflow/core/kernels:eigen_helpers"],
+    deps = [
+        "//tensorflow/core/kernels:eigen_contraction_kernel",
+        "//tensorflow/core/kernels:eigen_helpers",
+    ],
 )
 
 tf_gen_op_wrapper_py(
@@ -249,7 +252,10 @@ tf_custom_op_library(
         "kernels/gru_ops_gpu.cu.cc",
         "kernels/gru_ops.h",
     ],
-    deps = ["//tensorflow/core/kernels:eigen_helpers"],
+    deps = [
+        "//tensorflow/core/kernels:eigen_contraction_kernel",
+        "//tensorflow/core/kernels:eigen_helpers",
+    ],
 )
 
 tf_gen_op_wrapper_py(
@@ -346,6 +352,7 @@ tf_kernel_library(
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core/kernels:eigen_contraction_kernel",
         "//tensorflow/core/kernels:eigen_helpers",
         "//third_party/eigen3",
     ],
diff --git a/tensorflow/contrib/rnn/kernels/blas_gemm.h b/tensorflow/contrib/rnn/kernels/blas_gemm.h
index d37210d4b81203287fb633adc309688a35d093bb..12f3182a6a8878aa27ee143fa6405903e3fc4ef3 100644
--- a/tensorflow/contrib/rnn/kernels/blas_gemm.h
+++ b/tensorflow/contrib/rnn/kernels/blas_gemm.h
@@ -21,6 +21,10 @@ limitations under the License.
 #include "tensorflow/core/kernels/eigen_activations.h"
 #include "tensorflow/core/platform/types.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 namespace tensorflow {
 class OpKernelContext;
 namespace functor {
diff --git a/tensorflow/contrib/seq2seq/BUILD b/tensorflow/contrib/seq2seq/BUILD
index 18b56cd21942e28cb0dc3210df0bb04d55c1e16f..89176180ae0dd963bccc34aa2d0fc52be839dd3f 100644
--- a/tensorflow/contrib/seq2seq/BUILD
+++ b/tensorflow/contrib/seq2seq/BUILD
@@ -33,7 +33,6 @@ tf_custom_op_py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":beam_search_ops",
-        "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/rnn:rnn_py",
         "//tensorflow/contrib/util:util_py",
@@ -59,7 +58,6 @@ tf_custom_op_py_library(
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/ops/distributions",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
index b7f9f3fb090356a1c8d2bfb5044712ff93e267ce..abcf71c61b6e6df9462bf06323b8b11d5cc0d9a8 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
@@ -34,8 +34,6 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn_cell
 from tensorflow.python.ops import variables
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops.distributions import bernoulli
-from tensorflow.python.ops.distributions import categorical
 from tensorflow.python.platform import test
 # pylint: enable=g-import-not-at-top
 
@@ -517,7 +515,7 @@ class BasicDecoderTest(test.TestCase):
         vocabulary_size)
 
     # The sample function samples categorically from the logits.
-    sample_fn = lambda x: categorical.Categorical(logits=x).sample()
+    sample_fn = lambda x: helper_py.categorical_sample(logits=x)
     # The next inputs are a one-hot encoding of the sampled labels.
     next_inputs_fn = (
         lambda x: array_ops.one_hot(x, vocabulary_size, dtype=dtypes.float32))
@@ -599,7 +597,7 @@ class BasicDecoderTest(test.TestCase):
 
     # The sample function samples independent bernoullis from the logits.
     sample_fn = (
-        lambda x: bernoulli.Bernoulli(logits=x, dtype=dtypes.bool).sample())
+        lambda x: helper_py.bernoulli_sample(logits=x, dtype=dtypes.bool))
     # The next inputs are a one-hot encoding of the sampled labels.
     next_inputs_fn = math_ops.to_float
     end_fn = lambda sample_ids: sample_ids[:, end_token]
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
index 5aa32b532ffcf5772f6ace26662f5e5471cf6923..41b2a53ca5b178be9b04446c81d832575e5ed75b 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
@@ -14,80 +14,254 @@
 # ==============================================================================
 
 """Tests for contrib.seq2seq.python.seq2seq.loss_ops."""
-# pylint: disable=unused-import,g-bad-import-order
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-# pylint: enable=unused-import
 
 import numpy as np
 
 from tensorflow.contrib.seq2seq.python.ops import loss
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class LossTest(test.TestCase):
 
+  def setUp(self):
+    self.batch_size = 2
+    self.sequence_length = 3
+    self.number_of_classes = 5
+    logits = [
+        constant_op.constant(i + 0.5, shape=[self.batch_size,
+                                             self.number_of_classes])
+        for i in range(self.sequence_length)
+    ]
+    self.logits = array_ops.stack(logits, axis=1)
+    targets = [
+        constant_op.constant(i, dtypes.int32, shape=[self.batch_size])
+        for i in range(self.sequence_length)
+    ]
+    self.targets = array_ops.stack(targets, axis=1)
+    weights = [
+        constant_op.constant(1.0, shape=[self.batch_size])
+        for _ in range(self.sequence_length)
+    ]
+    self.weights = array_ops.stack(weights, axis=1)
+    # expected_loss = sparse_softmax_cross_entropy_with_logits(targets, logits)
+    # where targets = [0, 1, 2], and logits = [[0.5] * 5, [1.5] * 5, [2.5] * 5]
+    self.expected_loss = 1.60944
+
   def testSequenceLoss(self):
-    with self.session(use_gpu=True) as sess:
-      with variable_scope.variable_scope(
-          'root', initializer=init_ops.constant_initializer(0.5)):
-        batch_size = 2
-        sequence_length = 3
-        number_of_classes = 5
-        logits = [
-            constant_op.constant(
-                i + 0.5, shape=[batch_size, number_of_classes])
-            for i in range(sequence_length)
-        ]
-        logits = array_ops.stack(logits, axis=1)
-        targets = [
-            constant_op.constant(
-                i, dtypes.int32, shape=[batch_size])
-            for i in range(sequence_length)
-        ]
-        targets = array_ops.stack(targets, axis=1)
-        weights = [
-            constant_op.constant(
-                1.0, shape=[batch_size]) for i in range(sequence_length)
-        ]
-        weights = array_ops.stack(weights, axis=1)
-
-        average_loss_per_example = loss.sequence_loss(
-            logits, targets, weights,
-            average_across_timesteps=True,
-            average_across_batch=True)
-        res = sess.run(average_loss_per_example)
-        self.assertAllClose(1.60944, res)
-
-        average_loss_per_sequence = loss.sequence_loss(
-            logits, targets, weights,
-            average_across_timesteps=False,
-            average_across_batch=True)
-        res = sess.run(average_loss_per_sequence)
-        compare_per_sequence = np.ones((sequence_length)) * 1.60944
-        self.assertAllClose(compare_per_sequence, res)
-
-        average_loss_per_batch = loss.sequence_loss(
-            logits, targets, weights,
-            average_across_timesteps=True,
-            average_across_batch=False)
-        res = sess.run(average_loss_per_batch)
-        compare_per_batch = np.ones((batch_size)) * 1.60944
-        self.assertAllClose(compare_per_batch, res)
-
-        total_loss = loss.sequence_loss(
-            logits, targets, weights,
-            average_across_timesteps=False,
-            average_across_batch=False)
-        res = sess.run(total_loss)
-        compare_total = np.ones((batch_size, sequence_length)) * 1.60944
-        self.assertAllClose(compare_total, res)
+    with self.test_session(use_gpu=True):
+      average_loss_per_example = loss.sequence_loss(
+          self.logits, self.targets, self.weights,
+          average_across_timesteps=True,
+          average_across_batch=True)
+      res = self.evaluate(average_loss_per_example)
+      self.assertAllClose(self.expected_loss, res)
+
+      average_loss_per_sequence = loss.sequence_loss(
+          self.logits, self.targets, self.weights,
+          average_across_timesteps=False,
+          average_across_batch=True)
+      res = self.evaluate(average_loss_per_sequence)
+      compare_per_sequence = np.full((self.sequence_length), self.expected_loss)
+      self.assertAllClose(compare_per_sequence, res)
+
+      average_loss_per_batch = loss.sequence_loss(
+          self.logits, self.targets, self.weights,
+          average_across_timesteps=True,
+          average_across_batch=False)
+      res = self.evaluate(average_loss_per_batch)
+      compare_per_batch = np.full((self.batch_size), self.expected_loss)
+      self.assertAllClose(compare_per_batch, res)
+
+      total_loss = loss.sequence_loss(
+          self.logits, self.targets, self.weights,
+          average_across_timesteps=False,
+          average_across_batch=False)
+      res = self.evaluate(total_loss)
+      compare_total = np.full((self.batch_size, self.sequence_length),
+                              self.expected_loss)
+      self.assertAllClose(compare_total, res)
+
+  def testSequenceLossClass(self):
+    with self.test_session(use_gpu=True):
+      seq_loss = loss.SequenceLoss(average_across_timesteps=True,
+                                   average_across_batch=True,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=False)
+      average_loss_per_example = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_example)
+      self.assertAllClose(self.expected_loss, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=True,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=False)
+      average_loss_per_sequence = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_sequence)
+      compare_per_sequence = np.full((self.sequence_length), self.expected_loss)
+      self.assertAllClose(compare_per_sequence, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=True,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=False)
+      average_loss_per_batch = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_batch)
+      compare_per_batch = np.full((self.batch_size), self.expected_loss)
+      self.assertAllClose(compare_per_batch, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=False)
+      total_loss = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(total_loss)
+      compare_total = np.full((self.batch_size, self.sequence_length),
+                              self.expected_loss)
+      self.assertAllClose(compare_total, res)
+
+  def testSumReduction(self):
+    with self.test_session(use_gpu=True):
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=True,
+                                   sum_over_batch=True)
+      average_loss_per_example = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_example)
+      self.assertAllClose(self.expected_loss, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=True)
+      average_loss_per_sequence = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_sequence)
+      compare_per_sequence = np.full((self.sequence_length), self.expected_loss)
+      self.assertAllClose(compare_per_sequence, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=True,
+                                   sum_over_batch=False)
+      average_loss_per_batch = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_batch)
+      compare_per_batch = np.full((self.batch_size), self.expected_loss)
+      self.assertAllClose(compare_per_batch, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=False)
+      total_loss = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(total_loss)
+      compare_total = np.full((self.batch_size, self.sequence_length),
+                              self.expected_loss)
+      self.assertAllClose(compare_total, res)
+
+  def testWeightedSumReduction(self):
+    weights = [
+        constant_op.constant(1.0, shape=[self.batch_size])
+        for _ in range(self.sequence_length)
+    ]
+    # Make the last element in the sequence to have zero weights.
+    weights[-1] = constant_op.constant(0.0, shape=[self.batch_size])
+    self.weights = array_ops.stack(weights, axis=1)
+    with self.test_session(use_gpu=True):
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=True,
+                                   sum_over_batch=True)
+      average_loss_per_example = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_example)
+      self.assertAllClose(self.expected_loss, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=True)
+      average_loss_per_sequence = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_sequence)
+      compare_per_sequence = np.full((self.sequence_length), self.expected_loss)
+      # The last element in every sequence are zeros, which will be filtered.
+      compare_per_sequence[-1] = 0.
+      self.assertAllClose(compare_per_sequence, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=True,
+                                   sum_over_batch=False)
+      average_loss_per_batch = seq_loss(self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_batch)
+      compare_per_batch = np.full((self.batch_size), self.expected_loss)
+      self.assertAllClose(compare_per_batch, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=False)
+      total_loss = seq_loss(self.targets, self.logits, self.weights)
+      res = self.evaluate(total_loss)
+      compare_total = np.full((self.batch_size, self.sequence_length),
+                              self.expected_loss)
+      # The last element in every sequence are zeros, which will be filtered.
+      compare_total[:, -1] = 0
+      self.assertAllClose(compare_total, res)
+
+  def testZeroWeights(self):
+    weights = [
+        constant_op.constant(0.0, shape=[self.batch_size])
+        for _ in range(self.sequence_length)
+    ]
+    weights = array_ops.stack(weights, axis=1)
+    with self.test_session(use_gpu=True):
+      average_loss_per_example = loss.sequence_loss(
+          self.logits, self.targets, weights,
+          average_across_timesteps=True,
+          average_across_batch=True)
+      res = self.evaluate(average_loss_per_example)
+      self.assertAllClose(0.0, res)
+
+      average_loss_per_sequence = loss.sequence_loss(
+          self.logits, self.targets, weights,
+          average_across_timesteps=False,
+          average_across_batch=True)
+      res = self.evaluate(average_loss_per_sequence)
+      compare_per_sequence = np.zeros((self.sequence_length))
+      self.assertAllClose(compare_per_sequence, res)
+
+      average_loss_per_batch = loss.sequence_loss(
+          self.logits, self.targets, weights,
+          average_across_timesteps=True,
+          average_across_batch=False)
+      res = self.evaluate(average_loss_per_batch)
+      compare_per_batch = np.zeros((self.batch_size))
+      self.assertAllClose(compare_per_batch, res)
+
+      total_loss = loss.sequence_loss(
+          self.logits, self.targets, weights,
+          average_across_timesteps=False,
+          average_across_batch=False)
+      res = self.evaluate(total_loss)
+      compare_total = np.zeros((self.batch_size, self.sequence_length))
+      self.assertAllClose(compare_total, res)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py
index 3245cc5e72154289ea3ba000b9a30586a7ad03a9..033c2eb0801d5a51ee937f5e960faa91a6f1ae54 100644
--- a/tensorflow/contrib/seq2seq/python/ops/helper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/helper.py
@@ -32,9 +32,8 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import tensor_array_ops
-from tensorflow.python.ops.distributions import bernoulli
-from tensorflow.python.ops.distributions import categorical
 from tensorflow.python.util import nest
 
 __all__ = [
@@ -51,6 +50,68 @@ __all__ = [
 _transpose_batch_time = decoder._transpose_batch_time  # pylint: disable=protected-access
 
 
+# The following sample functions (_call_sampler, bernoulli_sample,
+# categorical_sample) mimic TensorFlow Probability distribution semantics.
+
+
+def _call_sampler(sample_n_fn, sample_shape, name=None):
+  """Reshapes vector of samples."""
+  with ops.name_scope(name, "call_sampler", values=[sample_shape]):
+    sample_shape = ops.convert_to_tensor(
+        sample_shape, dtype=dtypes.int32, name="sample_shape")
+    # Ensure sample_shape is a vector (vs just a scalar).
+    pad = math_ops.cast(math_ops.equal(array_ops.rank(sample_shape), 0),
+                        dtypes.int32)
+    sample_shape = array_ops.reshape(
+        sample_shape,
+        array_ops.pad(array_ops.shape(sample_shape),
+                      paddings=[[pad, 0]],
+                      constant_values=1))
+    samples = sample_n_fn(math_ops.reduce_prod(sample_shape))
+    batch_event_shape = array_ops.shape(samples)[1:]
+    final_shape = array_ops.concat([sample_shape, batch_event_shape], 0)
+    return array_ops.reshape(samples, final_shape)
+
+
+def bernoulli_sample(probs=None, logits=None, dtype=dtypes.int32,
+                     sample_shape=(), seed=None):
+  """Samples from Bernoulli distribution."""
+  if probs is None:
+    probs = math_ops.sigmoid(logits, name="probs")
+  else:
+    probs = ops.convert_to_tensor(probs, name="probs")
+  batch_shape_tensor = array_ops.shape(probs)
+  def _sample_n(n):
+    """Sample vector of Bernoullis."""
+    new_shape = array_ops.concat([[n], batch_shape_tensor], 0)
+    uniform = random_ops.random_uniform(
+        new_shape, seed=seed, dtype=probs.dtype)
+    return math_ops.cast(math_ops.less(uniform, probs), dtype)
+  return _call_sampler(_sample_n, sample_shape)
+
+
+def categorical_sample(logits, dtype=dtypes.int32,
+                       sample_shape=(), seed=None):
+  """Samples from categorical distribution."""
+  logits = ops.convert_to_tensor(logits, name="logits")
+  event_size = array_ops.shape(logits)[-1]
+  batch_shape_tensor = array_ops.shape(logits)[:-1]
+  def _sample_n(n):
+    """Sample vector of categoricals."""
+    if logits.shape.ndims == 2:
+      logits_2d = logits
+    else:
+      logits_2d = array_ops.reshape(logits, [-1, event_size])
+    sample_dtype = dtypes.int64 if logits.dtype.size > 4 else dtypes.int32
+    draws = random_ops.multinomial(
+        logits_2d, n, seed=seed, output_dtype=sample_dtype)
+    draws = array_ops.reshape(
+        array_ops.transpose(draws),
+        array_ops.concat([[n], batch_shape_tensor], 0))
+    return math_ops.cast(draws, dtype)
+  return _call_sampler(_sample_n, sample_shape)
+
+
 def _unstack_ta(inp):
   return tensor_array_ops.TensorArray(
       dtype=inp.dtype, size=array_ops.shape(inp)[0],
@@ -307,14 +368,14 @@ class ScheduledEmbeddingTrainingHelper(TrainingHelper):
     with ops.name_scope(name, "ScheduledEmbeddingTrainingHelperSample",
                         [time, outputs, state]):
       # Return -1s where we did not sample, and sample_ids elsewhere
-      select_sampler = bernoulli.Bernoulli(
-          probs=self._sampling_probability, dtype=dtypes.bool)
-      select_sample = select_sampler.sample(
-          sample_shape=self.batch_size, seed=self._scheduling_seed)
-      sample_id_sampler = categorical.Categorical(logits=outputs)
+      select_sample = bernoulli_sample(
+          probs=self._sampling_probability,
+          dtype=dtypes.bool,
+          sample_shape=self.batch_size,
+          seed=self._scheduling_seed)
       return array_ops.where(
           select_sample,
-          sample_id_sampler.sample(seed=self._seed),
+          categorical_sample(logits=outputs, seed=self._seed),
           gen_array_ops.fill([self.batch_size], -1))
 
   def next_inputs(self, time, outputs, state, sample_ids, name=None):
@@ -425,8 +486,10 @@ class ScheduledOutputTrainingHelper(TrainingHelper):
   def sample(self, time, outputs, state, name=None):
     with ops.name_scope(name, "ScheduledOutputTrainingHelperSample",
                         [time, outputs, state]):
-      sampler = bernoulli.Bernoulli(probs=self._sampling_probability)
-      return sampler.sample(sample_shape=self.batch_size, seed=self._seed)
+      return bernoulli_sample(
+          probs=self._sampling_probability,
+          sample_shape=self.batch_size,
+          seed=self._seed)
 
   def next_inputs(self, time, outputs, state, sample_ids, name=None):
     with ops.name_scope(name, "ScheduledOutputTrainingHelperNextInputs",
@@ -610,8 +673,7 @@ class SampleEmbeddingHelper(GreedyEmbeddingHelper):
     else:
       logits = outputs / self._softmax_temperature
 
-    sample_id_sampler = categorical.Categorical(logits=logits)
-    sample_ids = sample_id_sampler.sample(seed=self._seed)
+    sample_ids = categorical_sample(logits=logits, seed=self._seed)
 
     return sample_ids
 
diff --git a/tensorflow/contrib/seq2seq/python/ops/loss.py b/tensorflow/contrib/seq2seq/python/ops/loss.py
index 39a6d2f58b140706a94d83273d3327edd1891368..0fbfd6187030f14ac105a18b3e09b7a42d4de32a 100644
--- a/tensorflow/contrib/seq2seq/python/ops/loss.py
+++ b/tensorflow/contrib/seq2seq/python/ops/loss.py
@@ -20,11 +20,12 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
+from tensorflow.python.keras.losses import Loss
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 
-__all__ = ["sequence_loss"]
+__all__ = ["sequence_loss", "SequenceLoss"]
 
 
 def sequence_loss(logits,
@@ -32,16 +33,26 @@ def sequence_loss(logits,
                   weights,
                   average_across_timesteps=True,
                   average_across_batch=True,
+                  sum_over_timesteps=False,
+                  sum_over_batch=False,
                   softmax_loss_function=None,
                   name=None):
   """Weighted cross-entropy loss for a sequence of logits.
 
-  Depending on the values of `average_across_timesteps` and
-  `average_across_batch`, the return Tensor will have rank 0, 1, or 2 as these
-  arguments reduce the cross-entropy at each target, which has shape
-  `[batch_size, sequence_length]`, over their respective dimensions. For
-  example, if `average_across_timesteps` is `True` and `average_across_batch`
-  is `False`, then the return Tensor will have shape `[batch_size]`.
+  Depending on the values of `average_across_timesteps` / `sum_over_timesteps`
+  and `average_across_batch` / `sum_over_batch`, the return Tensor will have
+  rank 0, 1, or 2 as these arguments reduce the cross-entropy at each target,
+  which has shape `[batch_size, sequence_length]`, over their respective
+  dimensions. For example, if `average_across_timesteps` is `True` and
+  `average_across_batch` is `False`, then the return Tensor will have shape
+  `[batch_size]`.
+
+  Note that `average_across_timesteps` and `sum_over_timesteps` cannot be True
+  at same time. Same for `average_across_batch` and `sum_over_batch`.
+
+  The recommended loss reduction in tf 2.0 has been changed to sum_over, instead
+  of weighted average. User are recommend to use `sum_over_timesteps` and
+  `sum_over_batch` for reduction.
 
   Args:
     logits: A Tensor of shape
@@ -58,6 +69,12 @@ def sequence_loss(logits,
       dimension and divide the cost by the total label weight across timesteps.
     average_across_batch: If set, sum the cost across the batch dimension and
       divide the returned cost by the batch size.
+    sum_over_timesteps: If set, sum the cost across the sequence dimension and
+      divide the size of the sequence. Note that any element with 0 weights will
+      be excluded from size calculation.
+    sum_over_batch: if set, sum the cost across the batch dimension and divide
+      the total cost by the batch size. Not that any element with 0 weights will
+      be excluded from size calculation.
     softmax_loss_function: Function (labels, logits) -> loss-batch
       to be used instead of the standard softmax (the default if this is None).
       **Note that to avoid confusion, it is required for the function to accept
@@ -78,11 +95,15 @@ def sequence_loss(logits,
     raise ValueError("Logits must be a "
                      "[batch_size x sequence_length x logits] tensor")
   if len(targets.get_shape()) != 2:
-    raise ValueError("Targets must be a [batch_size x sequence_length] "
-                     "tensor")
+    raise ValueError("Targets must be a [batch_size x sequence_length] tensor")
   if len(weights.get_shape()) != 2:
-    raise ValueError("Weights must be a [batch_size x sequence_length] "
-                     "tensor")
+    raise ValueError("Weights must be a [batch_size x sequence_length] tensor")
+  if average_across_timesteps and sum_over_timesteps:
+    raise ValueError("average_across_timesteps and sum_over_timesteps cannot "
+                     "be set to True at same time.")
+  if average_across_batch and sum_over_batch:
+    raise ValueError("average_across_batch and sum_over_batch cannot be set "
+                     "to True at same time.")
   with ops.name_scope(name, "sequence_loss", [logits, targets, weights]):
     num_classes = array_ops.shape(logits)[2]
     logits_flat = array_ops.reshape(logits, [-1, num_classes])
@@ -96,20 +117,56 @@ def sequence_loss(logits,
     if average_across_timesteps and average_across_batch:
       crossent = math_ops.reduce_sum(crossent)
       total_size = math_ops.reduce_sum(weights)
-      total_size += 1e-12  # to avoid division by 0 for all-0 weights
-      crossent /= total_size
+      crossent = math_ops.div_no_nan(crossent, total_size)
+    elif sum_over_timesteps and sum_over_batch:
+      crossent = math_ops.reduce_sum(crossent)
+      total_count = math_ops.cast(math_ops.count_nonzero(weights),
+                                  crossent.dtype)
+      crossent = math_ops.div_no_nan(crossent, total_count)
     else:
-      batch_size = array_ops.shape(logits)[0]
-      sequence_length = array_ops.shape(logits)[1]
-      crossent = array_ops.reshape(crossent, [batch_size, sequence_length])
-    if average_across_timesteps and not average_across_batch:
-      crossent = math_ops.reduce_sum(crossent, axis=[1])
-      total_size = math_ops.reduce_sum(weights, axis=[1])
-      total_size += 1e-12  # to avoid division by 0 for all-0 weights
-      crossent /= total_size
-    if not average_across_timesteps and average_across_batch:
-      crossent = math_ops.reduce_sum(crossent, axis=[0])
-      total_size = math_ops.reduce_sum(weights, axis=[0])
-      total_size += 1e-12  # to avoid division by 0 for all-0 weights
-      crossent /= total_size
+      crossent = array_ops.reshape(crossent, array_ops.shape(logits)[0:2])
+      if average_across_timesteps or average_across_batch:
+        reduce_axis = [0] if average_across_batch else [1]
+        crossent = math_ops.reduce_sum(crossent, axis=reduce_axis)
+        total_size = math_ops.reduce_sum(weights, axis=reduce_axis)
+        crossent = math_ops.div_no_nan(crossent, total_size)
+      elif sum_over_timesteps or sum_over_batch:
+        reduce_axis = [0] if sum_over_batch else [1]
+        crossent = math_ops.reduce_sum(crossent, axis=reduce_axis)
+        total_count = math_ops.cast(
+            math_ops.count_nonzero(weights, axis=reduce_axis),
+            dtype=crossent.dtype)
+        crossent = math_ops.div_no_nan(crossent, total_count)
     return crossent
+
+
+class SequenceLoss(Loss):
+  """Weighted cross-entropy loss for a sequence of logits."""
+
+  def __init__(self,
+               average_across_timesteps=False,
+               average_across_batch=False,
+               sum_over_timesteps=True,
+               sum_over_batch=True,
+               softmax_loss_function=None,
+               name=None):
+    super(SequenceLoss, self).__init__(name=name)
+    self.average_across_timesteps = average_across_timesteps
+    self.average_across_batch = average_across_batch
+    self.sum_over_timesteps = sum_over_timesteps
+    self.sum_over_batch = sum_over_batch
+    self.softmax_loss_function = softmax_loss_function
+
+  def __call__(self, y_true, y_pred, sample_weight=None):
+    """Override the parent __call__ to have a customized reduce behavior."""
+    return sequence_loss(y_pred, y_true, sample_weight,
+                         average_across_timesteps=self.average_across_timesteps,
+                         average_across_batch=self.average_across_batch,
+                         sum_over_timesteps=self.sum_over_timesteps,
+                         sum_over_batch=self.sum_over_batch,
+                         softmax_loss_function=self.softmax_loss_function,
+                         name=self.name)
+
+  def call(self, y_true, y_pred):
+    # Skip this method since the __call__ contains real implementation.
+    pass
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
index d3edb43733761a906c6e5bf8b65f76e3e1ae56fc..3100a5a0e5da1103b61bd089cd433721686b9e72 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
@@ -32,7 +32,7 @@ class DecisionTreeResource : public ResourceBase {
   // Constructor.
   explicit DecisionTreeResource(const TensorForestParams& params);
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("DecisionTree[size=",
                            decision_tree_->decision_tree().nodes_size(), "]");
   }
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
index eea0be27caf0a022ba7acaacd359c75a2df4eedb..44f2b3f473b9eced06bd800b9cf0a5a0825ec3eb 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
@@ -40,7 +40,7 @@ class FertileStatsResource : public ResourceBase {
     model_op_ = LeafModelOperatorFactory::CreateLeafModelOperator(params_);
   }
 
-  string DebugString() override { return "FertileStats"; }
+  string DebugString() const override { return "FertileStats"; }
 
   void ExtractFromProto(const FertileStats& stats);
 
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index da123e1623f9abc12dd7a44f3d7e4127740a62df..3d34b91a74f9e9f5b211fc60049f0749962d649a 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -491,6 +491,8 @@ cuda_py_tests(
         "test/binary_tensor_weight_broadcast_test.py",
         "test/concatenation_test.py",
         "test/const_broadcast_test.py",
+        "test/conv2d_test.py",
+        "test/identity_output_test.py",
         "test/manual_test.py",
         "test/memory_alignment_test.py",
         "test/multi_connection_neighbor_engine_test.py",
@@ -549,7 +551,6 @@ cuda_py_test(
     ],
     tags = [
         "no_cuda_on_cpu_tap",
-        "no_oss",  # TODO(b/121194394): re-enable in OSS after OOM is fixed.
         "no_pip",
         "no_tap",  # It is not able to download the mnist data.
         "no_windows",
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index bf2de94e04ae3f6817f7a679ce9fd88e750827dd..eef647473ab12a1425b8d7810bd1f39f8b818dc5 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -334,13 +334,12 @@ struct EdgePtrCompare {
 tensorflow::Status GetEngineInfo(
     const tensorflow::Graph* g,
     const tensorflow::grappler::GraphProperties& graph_properties,
-    const std::set<string>& segment_nodes,
+    const std::set<const Node*>& segment_nodes,
     const std::unordered_map<string, tensorflow::Node*>& node_map,
     const std::vector<tensorflow::Node*>& reverse_topo_order,
     EngineInfo* info) {
-  std::vector<int> subgraph_node_ids;  // Topologically sorted node ids.
-  std::set<string> subgraph_node_names = segment_nodes;
-  std::set<int> added_const_node_ids;  // Used to prevent double insertion.
+  std::vector<const Node*> subgraph_nodes;  // Topologically sorted nodes.
+  std::set<const Node*> added_const_nodes;  // Used to prevent double insertion.
   std::set<string> segment_devices;
 
   // Map from src_node_name+port to the unique port numbers of the TRT op, where
@@ -352,9 +351,8 @@ tensorflow::Status GetEngineInfo(
   std::unordered_map<string, int> input_to_engine_port, output_to_engine_port;
   for (auto it = reverse_topo_order.rbegin(); it != reverse_topo_order.rend();
        ++it) {
-    const auto& node_name = (*it)->name();
-    if (segment_nodes.count(node_name) == 0) continue;
-    auto node = *it;
+    const Node* node = *it;
+    if (segment_nodes.count(node) == 0) continue;
     auto node_device = node->requested_device();
     if (!node_device.empty()) {
       segment_devices.insert(node_device);
@@ -366,8 +364,11 @@ tensorflow::Status GetEngineInfo(
                 << " neither have requested device nor assigned device";
       }
     }
+    subgraph_nodes.push_back(node);
+
     const int node_id = node->id();
-    subgraph_node_ids.push_back(node_id);
+    const string& node_name = node->name();
+
     // Create input connections. Sort edges first to make determnistic since
     // in_edges is a set of pointers.
     std::vector<const tensorflow::Edge*> in_edges(node->in_edges().begin(),
@@ -375,7 +376,7 @@ tensorflow::Status GetEngineInfo(
     std::sort(in_edges.begin(), in_edges.end(), EdgePtrCompare());
     for (const auto edge : in_edges) {
       auto input_node = edge->src();
-      if (input_node->IsSource() || segment_nodes.count(input_node->name())) {
+      if (input_node->IsSource() || segment_nodes.count(input_node)) {
         continue;
       }
       if (edge->IsControlEdge()) {
@@ -392,12 +393,11 @@ tensorflow::Status GetEngineInfo(
         //
         // Note that the segmenter already ensure that the constant data input
         // is valid and suppported by the engine.
-        if (!added_const_node_ids.insert(input_node->id()).second) {
+        if (!added_const_nodes.insert(input_node).second) {
           // Already added before.
           continue;
         }
         VLOG(1) << "Adding const node " << input_node->name();
-        QCHECK(subgraph_node_names.insert(input_node->name()).second);
         // Since we already add (duplicate) the const input node to the segment
         // graphdef, it's now not a data dependency any more, but to make the
         // dependency correct we still add a control dependency.
@@ -428,7 +428,7 @@ tensorflow::Status GetEngineInfo(
     std::sort(out_edges.begin(), out_edges.end(), EdgePtrCompare());
     for (const auto edge : out_edges) {
       auto output_node = edge->dst();
-      if (output_node->IsSink() || segment_nodes.count(output_node->name())) {
+      if (output_node->IsSink() || segment_nodes.count(output_node)) {
         continue;
       }
       if (edge->IsControlEdge()) {
@@ -456,12 +456,11 @@ tensorflow::Status GetEngineInfo(
   }  // For each segment node in topological order.
 
   // Construct the const nodes first.
-  subgraph_node_ids.insert(subgraph_node_ids.begin(),
-                           added_const_node_ids.begin(),
-                           added_const_node_ids.end());
+  subgraph_nodes.insert(subgraph_nodes.begin(), added_const_nodes.begin(),
+                        added_const_nodes.end());
   TF_RETURN_IF_ERROR(ConvertSegmentToGraphDef(
-      g, graph_properties, subgraph_node_names, subgraph_node_ids,
-      &info->connections, &info->segment_graph_def, &info->engine_name));
+      g, graph_properties, subgraph_nodes, &info->connections,
+      &info->segment_graph_def, &info->engine_name));
   // TODO(sami): This should not happen once segmenter is updated.
   if (segment_devices.size() == 1) {
     info->device = *segment_devices.begin();
@@ -1033,27 +1032,31 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
     cudaSetDevice(cuda_device_id);
     auto status = CreateTRTNode(engine_segments, i, params.max_batch_size,
                                 &graph, alloc.get(), &engine_nodes);
-    // If status is ok, we successfully added the node to the graph and can
-    // remove segment ops. Otherwise graph is not modified.
+
     string msg = StrCat("TensorRT node ", engine.engine_name,
                         " added for segment ", i, " consisting of ",
                         converted_segments.at(i).first.size(), " nodes");
     if (status.ok()) {
       LOG(INFO) << msg << " succeeded.";
-      for (auto node_name : converted_segments.at(i).first) {
-        graph.RemoveNode(node_map.at(node_name));
-      }
     } else {
       // Graph is not modified.
       LOG(WARNING) << msg << " failed: " << status << ". Fallback to TF...";
     }
     if (VLOG_IS_ON(1)) {
       msg = "Segment consists of nodes: ";
-      for (const string& node_name : converted_segments.at(i).first) {
-        StrAppend(&msg, node_name, ", ");
+      for (const Node* node : converted_segments.at(i).first) {
+        StrAppend(&msg, node->name(), ", ");
       }
       VLOG(1) << msg;
     }
+
+    // If status is ok, we successfully added the node to the graph and can
+    // remove segment ops. Otherwise graph is not modified.
+    if (status.ok()) {
+      for (const Node* node : converted_segments.at(i).first) {
+        graph.RemoveNode(const_cast<Node*>(node));
+      }
+    }
   }
   cudaSetDevice(old_cuda_device);
   graph.ToGraphDef(params.output_graph_def);
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index adf8831b960172fc29b5d631e5b0533318d4764d..7b0c4b446d73e0d01eb3ab0c0431800b2661119b 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -879,6 +879,8 @@ Status Converter::ConvertNode(const NodeDef& node_def) {
     // We need to check the name before setting it. If the input is one of the
     // engine input, setting the name here will overwrite engine input
     // bindings which will cause runtime error.
+    // TODO(tmorris): Remove this work-around once we use TRT's IIdentityLayer
+    // in ConvertIdentity.
     if (output.is_tensor()) {
       const char* tensor_name = output.tensor()->getName();
       if (!tensorflow::str_util::StartsWith(tensor_name, kInputPHName)) {
@@ -939,6 +941,22 @@ Status Converter::RenameAndMarkOutputTensors(
     if (tensor == nullptr) {
       return errors::NotFound("Output tensor not found: ", output.first);
     }
+    // Check if this tensor has already been marked as an output.
+    // ConvertIdentity can cause the same tensor to be repeated in
+    // output_tensors, which can cause us to overwrite the name of the output
+    // tensor binding. For example, if we rename OutputPH_0 to OutputPH_1 then
+    // we won't be able to locate OutputPH_0 during runtime. To fix this,
+    // duplicate the tensor using no-op shuffle.
+    // TODO(tmorris): Remove this work-around once we use TRT's IIdentityLayer
+    // in ConvertIdentity.
+    if (tensorflow::str_util::StartsWith(tensor->getName(), kOutputPHName)) {
+      // Using shuffle layer for identity by not setting reshape or transpose.
+      nvinfer1::IShuffleLayer* layer = network()->addShuffle(*tensor);
+      TFTRT_RETURN_ERROR_IF_NULLPTR(
+          layer, StrCat("Output Copy for ", tensor->getName()));
+      MarkQuantizationRangesAsInferrable(tensor, layer->getOutput(0));
+      tensor = layer->getOutput(0);
+    }
     tensor->setName(output.second.c_str());
     VLOG(1) << "Marking output tensor " << output.first << ", as output tensor "
             << output.second;
@@ -1538,6 +1556,11 @@ enum class ConvolutionType { DEFAULT, DEPTHWISE_CONV };
 tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
+  if (inputs.size() != 2) {
+    return tensorflow::errors::InvalidArgument("Two inputs are expected for ",
+                                               node_def.op(), ", at ",
+                                               node_def.name());
+  }
   if (inputs.at(0).is_weights()) {
     return tensorflow::errors::Unimplemented(
         node_def.op(), " is only implemented for tensors, not weights, at ",
@@ -1549,39 +1572,61 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
                                              node_def.name());
   }
   TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
-  VLOG(2) << "weight shape: " << weights_rsck.DebugString();
   if (weights_rsck.shape_.nbDims != 4) {
-    return tensorflow::errors::Internal(
-        "Conv2D expects kernel of dimension 4, at: " + node_def.name());
+    return tensorflow::errors::InvalidArgument(
+        "Conv2D expects kernel of dimension 4, at " + node_def.name());
   }
+  TFAttrs attrs(node_def);
+  auto data_format = attrs.get<string>("data_format");
+  int c_index = (data_format == "NHWC") ? 3 : 1;
+  int h_index = (data_format == "NHWC") ? 1 : 2;
+  int w_index = (data_format == "NHWC") ? 2 : 3;
+  auto tf_dilations = attrs.get<std::vector<int>>("dilations");
+  if (tf_dilations.size() != 4) {
+    return tensorflow::errors::InvalidArgument(
+        "Convolution dilations field must specify 4 dimensions, at ",
+        node_def.name());
+  }
+  if (tf_dilations[0] != 1 || tf_dilations[c_index] != 1) {
+    return tensorflow::errors::Unimplemented(
+        "Dilation rate must be 1 for batch and channel dimensions, at ",
+        node_def.name());
+  }
+  const nvinfer1::DimsHW dilation(tf_dilations[h_index], tf_dilations[w_index]);
+
+  const auto tf_stride = attrs.get<std::vector<int>>("strides");
+  if (tf_stride.size() != 4) {
+    return tensorflow::errors::InvalidArgument(
+        "Convolution strides field must specify 4 dimensions, at ",
+        node_def.name());
+  }
+  if (tf_stride[0] != 1 || tf_stride[c_index] != 1) {
+    return tensorflow::errors::Unimplemented(
+        "Stride must be 1 for batch and channel dimensions, at ",
+        node_def.name());
+  }
+  const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
   if (params->validation_only) return tensorflow::Status::OK();
 
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
-  TFAttrs attrs(node_def);
 
-  int h_index = 2;
-  int w_index = 3;
-  auto data_format = attrs.get<string>("data_format");
-  if (data_format == "NHWC") {
+  // Transpose to NCHW (NCHW is required for IConvLayer).
+  const bool need_transpose = (data_format == "NHWC");
+  if (need_transpose) {
     TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
         const_cast<nvinfer1::ITensor*>(tensor), {0, 3, 1, 2}, &tensor));
-    h_index = 1;
-    w_index = 2;
-    // TODO(jie): transpose it
   }
-
-  // tensor after transpose (NCHW)
+  // Dimensions of transposed tensor.
   const auto tensor_dim = tensor->getDimensions();
 
-  int num_groups = group;
-  if (num_groups == 0) num_groups = tensor_dim.d[0];  // depthwise convolution
-  VLOG(2) << "groups count: " << num_groups;
+  // For depthwise convolution, group will be 0 so set num_groups to size of
+  // input's channel dim. For a non-depthwise conv, num_groups will be 1.
+  const int num_groups = (group == 0) ? tensor_dim.d[0] : group;
 
   if (params->converter->precision_mode() == FP16MODE) {
     weights_rsck =
         ConvertFP32ToFP16(params->weight_store, inputs.at(1).weights());
   }
-
   TRT_ShapedWeights weights =
       params->weight_store->GetTempWeights(weights_rsck);
   ReorderRSCKToKCRS(weights_rsck, &weights, num_groups);
@@ -1590,35 +1635,22 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
   nvinfer1::DimsHW kernel_size;
   kernel_size.h() = weights.shape_.d[2];
   kernel_size.w() = weights.shape_.d[3];
-  VLOG(2) << "RSCK: " << weights.DebugString();
-  VLOG(2) << "kernel size: " << kernel_size.h() << ", " << kernel_size.w();
-
-  // TODO(jie): stride. (NHWC/NCHW)
-  const auto tf_stride = attrs.get<std::vector<int>>("strides");
-  VLOG(2) << "h_INDEX" << h_index << ", w_index " << w_index;
-  VLOG(2) << "stride: " << tf_stride[0] << tf_stride[1] << tf_stride[2]
-          << tf_stride[3];
-  const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
 
+  // Add padding.
   std::vector<std::pair<int, int>> padding;
-  // TODO(jie): padding.
   if (attrs.get<string>("padding") == "SAME") {
-    // This is NCHW tensor with no batch dimension.
-    //  1 -> h
-    //  2 -> w
+    nvinfer1::DimsHW effective_kernel_size = kernel_size;
+    effective_kernel_size.h() += (kernel_size.h() - 1) * (dilation.h() - 1);
+    effective_kernel_size.w() += (kernel_size.w() - 1) * (dilation.w() - 1);
     padding = CreateSamePadding(
-        stride, kernel_size,
+        stride, effective_kernel_size,
         {static_cast<int>(tensor_dim.d[1]), static_cast<int>(tensor_dim.d[2])});
   } else {
     padding = {{0, 0}, {0, 0}};
   }
-
   if (padding[0].first != padding[0].second ||
       padding[1].first != padding[1].second) {
-    // TODO(jie): handle asymmetric padding
-    VLOG(2) << "Padding!!!: " << padding[0].first << padding[0].second
-            << padding[1].first << padding[1].second;
-    VLOG(2) << "TENSOR before: " << DebugString(tensor->getDimensions());
+    // Handle asymmetric padding.
     auto pad_layer = params->converter->network()->addPadding(
         *const_cast<nvinfer1::ITensor*>(tensor),
         nvinfer1::DimsHW(padding[0].first, padding[1].first),
@@ -1628,24 +1660,23 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
         const_cast<nvinfer1::ITensor*>(tensor), pad_layer->getOutput(0));
     padding = {{0, 0}, {0, 0}};
     tensor = pad_layer->getOutput(0);
-    VLOG(2) << "TENSOR after: " << DebugString(tensor->getDimensions());
   }
 
+  // Add convolution.
   nvinfer1::IConvolutionLayer* layer =
       params->converter->network()->addConvolution(
           *const_cast<nvinfer1::ITensor*>(tensor), noutput, kernel_size,
           weights.GetTrtWeights(), biases.GetTrtWeights());
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
-
   layer->setStride(stride);
   layer->setPadding({padding[0].first, padding[1].first});
   layer->setName(node_def.name().c_str());
   layer->setNbGroups(num_groups);
+  layer->setDilation(dilation);
   const nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  VLOG(2) << "TENSOR out: " << DebugString(output_tensor->getDimensions());
-  VLOG(2) << "data_format: " << data_format;
-  if (data_format == "NHWC") {
-    // TODO(jie): transpose it back!
+
+  // Restore transpose.
+  if (need_transpose) {
     TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
         const_cast<nvinfer1::ITensor*>(output_tensor), {0, 2, 3, 1},
         &output_tensor));
@@ -3749,8 +3780,7 @@ tensorflow::Status ConvertGraphDefToEngine(
 tensorflow::Status ConvertSegmentToGraphDef(
     const tensorflow::Graph* graph,
     const tensorflow::grappler::GraphProperties& graph_properties,
-    const std::set<string>& subgraph_node_names,
-    const std::vector<int>& subgraph_node_ids,  // In topological order
+    const std::vector<const Node*>& subgraph_nodes,  // In topological order
     std::vector<EngineConnection>* connections,
     tensorflow::GraphDef* segment_def, string* common_scope) {
   std::set<string> marker_nodes;
@@ -3824,11 +3854,10 @@ tensorflow::Status ConvertSegmentToGraphDef(
 
   std::unordered_map<int, int> old_to_new_id_map;
   // Copy internal nodes to new graphdef
-  string local_scope = graph->FindNodeId(*subgraph_node_ids.begin())->name();
-  for (const auto node_id : subgraph_node_ids) {
-    const auto node = graph->FindNodeId(node_id);
+  string local_scope = subgraph_nodes.front()->name();
+  for (const Node* node : subgraph_nodes) {
     local_scope = GetCommonNameScope(local_scope, node->name());
-    old_to_new_id_map[node_id] = segment_def->node_size();
+    old_to_new_id_map[node->id()] = segment_def->node_size();
     auto snode = segment_def->add_node();
     snode->CopyFrom(node->def());
     VLOG(2) << "Copying " << snode->name() << " to subgraph";
@@ -3846,6 +3875,11 @@ tensorflow::Status ConvertSegmentToGraphDef(
             << placeholder_name;
     snode->set_input(connection.inside_port, placeholder_name);
   }
+  std::set<string> subgraph_node_names;
+  for (const Node* node : subgraph_nodes) {
+    subgraph_node_names.insert(node->name());
+  }
+
   // Remove control inputs that are not inside the segment.
   for (int i = 0; i < segment_def->node_size(); ++i) {
     auto snode = segment_def->mutable_node(i);
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index 54e19b73957bccdae2b23bd3556de9ad00b864e5..8f2271ee3f5af0198695c913c6757ef0f6d60d61 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -128,8 +128,7 @@ struct EngineInfo {
 tensorflow::Status ConvertSegmentToGraphDef(
     const tensorflow::Graph* graph,
     const tensorflow::grappler::GraphProperties& graph_properties,
-    const std::set<string>& subgraph_node_names,
-    const std::vector<int>& subgraph_node_ids,
+    const std::vector<const Node*>& subgraph_nodes,
     std::vector<EngineConnection>* connections,
     tensorflow::GraphDef* segment_def, string* common_scope);
 
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc
index a2ddfbffa5b0d8c421bcfe054097a9e42b79fe8f..f143f56d2094b2418df0122e6a7e6f5d41136f31 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc
@@ -2378,6 +2378,8 @@ TEST_F(OpConverterTest, ConvertStridedSlice) {
   };
 
   {
+    // Input is weights, should fail.
+    Reset();
     NodeDef node_def = get_strided_slice_nodedef();
     AddTestWeights<int32>("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6});
     AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
@@ -2619,6 +2621,240 @@ TEST_F(OpConverterTest, ConvertStridedSlice) {
   }
 }
 
+TEST_F(OpConverterTest, ConvertConv2D) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_conv2d", "Conv2D", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Two inputs are expected for Conv2D, at my_conv2d");
+  }
+
+  // Get nodedef for Conv2D layer.
+  auto get_conv2d_nodedef =
+      [](std::vector<int> strides = {1, 1, 1, 1}, string padding = "SAME",
+         string data_format = "NCHW",
+         std::vector<int> dilations = {1, 1, 1, 1}) -> NodeDef {
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto filter = ops::Placeholder(s.WithOpName("weights"), DT_FLOAT);
+    ops::Conv2D::Attrs attrs =
+        ops::Conv2D::Attrs().DataFormat(data_format).Dilations(dilations);
+    auto conv2d = ops::Conv2D(s.WithOpName("my_conv2d"), input, filter, strides,
+                              padding, attrs);
+    return conv2d.operation.node()->def();
+  };
+
+  {
+    // Input is weights, should fail.
+    Reset();
+    NodeDef node_def = get_conv2d_nodedef();
+    AddTestWeights<float>("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6});
+    AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Conv2D is only implemented for tensors, not weights, at my_conv2d");
+  }
+  {
+    // Filter is tensor, should fail.
+    Reset();
+    NodeDef node_def = get_conv2d_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestTensor("weights", {3, 3, 1, 1});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Kernel for Conv2D must be constant weights, at my_conv2d");
+  }
+  {
+    // Filter is not 4D, should fail.
+    Reset();
+    NodeDef node_def = get_conv2d_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights", {3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Conv2D expects kernel of dimension 4, at my_conv2d");
+  }
+  {
+    // Dilations is not 4D, should fail.
+    Reset();
+    NodeDef node_def =
+        get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NCHW", {1, 1, 1});
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Convolution dilations field must specify 4 dimensions, at my_conv2d");
+  }
+  {
+    // Dilation value is not 1 for channel, should fail.
+    Reset();
+    NodeDef node_def =
+        get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NCHW", {1, 2, 1, 1});
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Dilation rate must be 1 for batch and channel "
+                               "dimensions, at my_conv2d");
+  }
+  {
+    // Dilation value is not 1 for channel (NHWC), should fail.
+    Reset();
+    NodeDef node_def =
+        get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NHWC", {1, 1, 1, 2});
+    AddTestTensor("input", {2, 3, 1});
+    AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Dilation rate must be 1 for batch and channel "
+                               "dimensions, at my_conv2d");
+  }
+  {
+    // Strides is not 4D, should fail.
+    Reset();
+    NodeDef node_def =
+        get_conv2d_nodedef({1, 1, 1}, "SAME", "NCHW", {1, 1, 1, 1});
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Convolution strides field must specify 4 dimensions, at my_conv2d");
+  }
+  {
+    // Stride value is not 1 for channel, should fail.
+    Reset();
+    NodeDef node_def =
+        get_conv2d_nodedef({1, 2, 1, 1}, "SAME", "NCHW", {1, 1, 1, 1});
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Stride must be 1 for batch and channel dimensions, at my_conv2d");
+  }
+
+  struct TestParams {
+    TestParams(const std::vector<int>& input_dims,
+               const std::vector<float>& input,
+               const std::vector<int>& filter_dims,
+               const std::vector<float>& filter,
+               const std::vector<int>& strides, const string& padding,
+               const string& data_format, const std::vector<int>& dilations,
+               const std::vector<int>& expected_output_dims,
+               const std::vector<float>& expected_output)
+        : input_dims(input_dims),
+          input(input),
+          filter_dims(filter_dims),
+          filter(filter),
+          strides(strides),
+          padding(padding),
+          data_format(data_format),
+          dilations(dilations),
+          expected_output_dims(expected_output_dims),
+          expected_output(expected_output) {}
+
+    std::vector<int> input_dims;
+    std::vector<float> input;
+    std::vector<int> filter_dims;
+    std::vector<float> filter;
+    std::vector<int> strides;
+    string padding;
+    string data_format;
+    std::vector<int> dilations;
+    std::vector<int> expected_output_dims;
+    std::vector<float> expected_output;
+  };
+
+  // Ok.
+  const int kConv2DOKCases = 6;
+  TestParams ok_params[kConv2DOKCases] = {
+      // Basic
+      TestParams{/*input_dims=*/{1, 2, 3},
+                 /*input=*/{0, 1, 2, 3, 3, 4},
+                 /*filter_dims=*/{1, 2, 1, 1},
+                 /*filter=*/{-1, 1},
+                 /*strides=*/{1, 1, 1, 1},
+                 /*padding=*/"VALID",
+                 /*data_format=*/"NCHW",
+                 /*dilations=*/{1, 1, 1, 1},
+                 /*expected_output_dims=*/{1, 2, 2},
+                 /*expected_output=*/{1, 1, 0, 1}},
+      // SAME padding (Asymmetric)
+      TestParams{/*input_dims=*/{1, 2, 3},
+                 /*input=*/{0, 1, 2, 3, 3, 4},
+                 /*filter_dims=*/{1, 2, 1, 1},
+                 /*filter=*/{-1, 1},
+                 /*strides=*/{1, 1, 1, 1},
+                 /*padding=*/"SAME",
+                 /*data_format=*/"NCHW",
+                 /*dilations=*/{1, 1, 1, 1},
+                 /*expected_output_dims=*/{1, 2, 3},
+                 /*expected_output=*/{1, 1, -2, 0, 1, -4}},
+      // SAME padding (Symmetric)
+      TestParams{/*input_dims=*/{1, 2, 3},
+                 /*input=*/{0, 1, 2, 3, 3, 4},
+                 /*filter_dims=*/{1, 3, 1, 1},
+                 /*filter=*/{-1, 0, 1},
+                 /*strides=*/{1, 1, 1, 1},
+                 /*padding=*/"SAME",
+                 /*data_format=*/"NCHW",
+                 /*dilations=*/{1, 1, 1, 1},
+                 /*expected_output_dims=*/{1, 2, 3},
+                 /*expected_output=*/{1, 2, -1, 3, 1, -3}},
+      // NHWC
+      TestParams{/*input_dims=*/{2, 3, 1},
+                 /*input=*/{0, 1, 2, 3, 3, 4},
+                 /*filter_dims=*/{1, 2, 1, 1},
+                 /*filter=*/{-1, 1},
+                 /*strides=*/{1, 1, 1, 1},
+                 /*padding=*/"VALID",
+                 /*data_format=*/"NHWC",
+                 /*dilations=*/{1, 1, 1, 1},
+                 /*expected_output_dims=*/{2, 2, 1},
+                 /*expected_output=*/{1, 1, 0, 1}},
+      // Dilated
+      TestParams{/*input_dims=*/{1, 2, 3},
+                 /*input=*/{0, 1, 2, 3, 3, 4},
+                 /*filter_dims=*/{1, 2, 1, 1},
+                 /*filter=*/{-1, 1},
+                 /*strides=*/{1, 1, 1, 1},
+                 /*padding=*/"VALID",
+                 /*data_format=*/"NCHW",
+                 /*dilations=*/{1, 1, 1, 2},
+                 /*expected_output_dims=*/{1, 2, 1},
+                 /*expected_output=*/{2, 1}},
+      // Strided
+      TestParams{/*input_dims=*/{1, 2, 4},
+                 /*input=*/{0, 1, 2, 2, 3, 4, 4, 7},
+                 /*filter_dims=*/{1, 2, 1, 1},
+                 /*filter=*/{-1, 1},
+                 /*strides=*/{1, 1, 1, 2},
+                 /*padding=*/"VALID",
+                 /*data_format=*/"NCHW",
+                 /*dilations=*/{1, 1, 1, 1},
+                 /*expected_output_dims=*/{1, 2, 2},
+                 /*expected_output=*/{1, 0, 1, 3}},
+  };
+
+  for (int i = 0; i < kConv2DOKCases; i++) {
+    Reset();
+    NodeDef node_def =
+        get_conv2d_nodedef(ok_params[i].strides, ok_params[i].padding,
+                           ok_params[i].data_format, ok_params[i].dilations);
+    AddTestTensor("input", ok_params[i].input_dims);
+    AddTestWeights<float>("weights", ok_params[i].filter_dims,
+                          ok_params[i].filter);
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_conv2d", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
+                             output.tensor()->getDimensions());
+    std::vector<float> output_data(ok_params[i].expected_output.size());
+    BuildAndRun<float>({{"input", ok_params[i].input}}, "my_conv2d",
+                       &output_data);
+    EXPECT_THAT(output_data, ElementsAreArray(ok_params[i].expected_output));
+  }
+}
+
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index 203b2697babe32b45523109708cbf062dceee33b..92377edaecc8eb3bda3fcb12abb7aab48feedb2f 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -45,12 +45,19 @@ from tensorflow.python.saved_model import loader_impl
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.training import saver
 
-if _six.PY2:
-  _to_bytes = lambda s: s
-  _to_string = lambda s: s
-else:
-  _to_bytes = lambda s: s.encode("utf-8", errors="surrogateescape")
-  _to_string = lambda s: s.decode("utf-8")
+
+def _to_bytes(s):
+  """Encode s if it is a sequence of chars."""
+  if isinstance(s, _six.text_type):
+    return s.encode("utf-8", errors="surrogateescape")
+  return s
+
+
+def _to_string(s):
+  """Decode s if it is a sequence of bytes."""
+  if isinstance(s, _six.binary_type):
+    return s.decode("utf-8")
+  return s
 
 
 class TrtPrecisionMode(object):
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.h b/tensorflow/contrib/tensorrt/resources/trt_resources.h
index aac9e5c7bd725fc10bcaa04536ebc7be071b4d4c..8d877b392faa6a6773aea573b23bd02300051f05 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resources.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_resources.h
@@ -48,7 +48,7 @@ class TRTCalibrationResource : public tensorflow::ResourceBase {
     allocator_.reset();
   }
 
-  string DebugString() override {
+  string DebugString() const override {
     std::stringstream oss;
     using std::dec;
     using std::endl;
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index 084a96e0fa5c97edc58adf2590ed94e5ef0e4d85..ecaffa3023bc8f317d956181b44639bc80efda29 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -673,10 +673,11 @@ tensorflow::Status SegmentGraph(
   // --------------------------------- Step 3 ---------------------------------
   // Convert the segments into the expected return format
   for (const auto& itr : sg_map) {
-    const std::set<const tensorflow::Node*, NodePtrCompare>& segment_nodes =
-        itr.second;
+    const string& segment_root = itr.first;
+    // Return format does not require set comparator.
+    std::set<const Node*> segment_nodes(itr.second.begin(), itr.second.end());
     if (VLOG_IS_ON(1)) {
-      string s = "parent=" + itr.first + ":";
+      string s = "parent=" + segment_root + ":";
       for (auto node : segment_nodes) s += " " + node->name();
       VLOG(1) << "Segment " << segments->size() << ": " << s;
     }
@@ -689,12 +690,10 @@ tensorflow::Status SegmentGraph(
     }
 
     // TODO(sami): Make segmenter placement aware once trtscopes are in place
-    std::set<string> segment_node_names;
-    for (auto node : itr.second) segment_node_names.insert(node->name());
-    const auto& dev_itr = device_maps.find(itr.first);
+    const auto& dev_itr = device_maps.find(segment_root);
     if (dev_itr == device_maps.end() || dev_itr->second.empty()) {
       VLOG(1) << "No device assigned to segment " << segments->size();
-      segments->emplace_back(std::make_pair(segment_node_names, string()));
+      segments->emplace_back(std::make_pair(segment_nodes, string()));
     } else if (dev_itr->second.size() > 1) {
       string s("Segment ");
       StrAppend(&s, segments->size(), " has multiple devices attached: ");
@@ -703,10 +702,10 @@ tensorflow::Status SegmentGraph(
       }
       LOG(WARNING) << s << " choosing " << *(dev_itr->second.begin());
       segments->emplace_back(
-          std::make_pair(segment_node_names, *(dev_itr->second.begin())));
+          std::make_pair(segment_nodes, *(dev_itr->second.begin())));
     } else {
       segments->emplace_back(
-          std::make_pair(segment_node_names, *(dev_itr->second.begin())));
+          std::make_pair(segment_nodes, *(dev_itr->second.begin())));
     }
   }
   if (VLOG_IS_ON(1)) {
diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/contrib/tensorrt/segment/segment.h
index b9693aad1b764515459db6833b05221ea5b3a2d1..6cc92cdb5df396a6bca26119f152487bc3685a6d 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.h
+++ b/tensorflow/contrib/tensorrt/segment/segment.h
@@ -29,10 +29,10 @@ namespace tensorflow {
 namespace tensorrt {
 namespace segment {
 
-// Vector of segments, each entry contains a set of node names and a device name
-// in the segment.
-// TODO(aaroey): use node pointer instead of node name.
-using SegmentNodesVector = std::vector<std::pair<std::set<string>, string>>;
+// Vector of segments, each entry contains a set of node pointers and a device
+// name in the segment.
+using SegmentNodesVector =
+    std::vector<std::pair<std::set<const Node*>, string>>;
 
 struct SegmentOptions {
   // Segment must contain at least this many nodes.
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc
index 4805ef9c61a7784a1c08cf5eaf504691bc9dbedc..4ac02327ae68069278066b6e7e931bb9449c2603 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc
@@ -75,7 +75,10 @@ class SegmentTest : public ::testing::Test {
                        const std::vector<std::set<string>>& expected_segments) {
     EXPECT_EQ(expected_segments.size(), segments.size());
     for (int i = 0; i < segments.size(); ++i) {
-      const auto& segment_node_names = segments[i].first;
+      std::set<string> segment_node_names;
+      for (const Node* node : segments[i].first) {
+        segment_node_names.insert(node->name());
+      }
       const auto& expected = expected_segments[i];
       for (const auto& name : expected) {
         EXPECT_TRUE(segment_node_names.count(name))
diff --git a/tensorflow/contrib/tensorrt/test/conv2d_test.py b/tensorflow/contrib/tensorrt/test/conv2d_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4cee3cce00c7f17af82f1135c272258cfb66833
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/conv2d_test.py
@@ -0,0 +1,191 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_nn_ops
+from tensorflow.python.platform import test
+
+
+def conv2d_layer(inputs,
+                 filters,
+                 kernel_size,
+                 strides=(1, 1),
+                 padding="valid",
+                 data_format="channels_last",
+                 dilation_rate=(1, 1),
+                 name=None):
+  dtype = inputs.dtype
+  c_axis = -1 if data_format == "channels_last" else 1
+  nchan = inputs.shape[c_axis]
+  weights_shape = (kernel_size[0], kernel_size[1], nchan, filters)
+  weights = constant_op.constant(np.random.randn(*weights_shape), dtype=dtype)
+  padding = padding.upper()
+  if data_format == "channels_last":
+    strides = [1] + list(strides) + [1]
+    dilations = [1] + list(dilation_rate) + [1]
+    data_format = "NHWC"
+  else:
+    strides = [1, 1] + list(strides)
+    dilations = [1, 1] + list(dilation_rate)
+    data_format = "NCHW"
+  return gen_nn_ops.conv2d(
+      inputs,
+      weights,
+      strides=strides,
+      padding=padding,
+      dilations=dilations,
+      data_format=data_format)
+
+
+def div_round_up(n, d):
+  return (n - 1) // d + 1
+
+
+def build_graph(input_dims,
+                dtype,
+                num_filters,
+                data_format,
+                kernel_sizes,
+                dilation_rates,
+                padding="same"):
+  g = ops.Graph()
+  with g.as_default():
+    inp = array_ops.placeholder(
+        dtype=dtype, shape=[None] + input_dims[1:], name="input")
+    with g.device("/GPU:0"):
+      results = []
+      for kernel_size in kernel_sizes:
+        for dilation_rate in dilation_rates:
+          result = conv2d_layer(inp, num_filters, kernel_size, (1, 1), padding,
+                                data_format, dilation_rate)
+          results.append(result)
+      output = sum(results)
+      output = array_ops.identity(output, name="output")
+  return g
+
+
+class Conv2DNCHWTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Testing conversion of Conv2D (data_format=NCHW) in TF-TRT conversion."""
+    np.random.seed(1234)
+    input_dims = [13, 3, 7, 11]
+    g = build_graph(
+        input_dims=input_dims,
+        dtype=dtypes.float32,
+        num_filters=5,
+        data_format="channels_first",
+        kernel_sizes=[(3, 3), (3, 2)],
+        dilation_rates=[(1, 1), (2, 3)])
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=["input"],
+        input_dims=[input_dims],
+        output_names=["output"],
+        expected_output_dims=[(13, 5, 7, 11)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ["TRTEngineOp_0"]
+
+
+class Conv2DNHWCTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Testing conversion of Conv2D (data_format=NCHW) in TF-TRT conversion."""
+    np.random.seed(1234)
+    input_dims = [13, 7, 11, 3]
+    g = build_graph(
+        input_dims=input_dims,
+        dtype=dtypes.float32,
+        num_filters=5,
+        data_format="channels_last",
+        kernel_sizes=[(3, 3), (3, 2)],
+        dilation_rates=[(1, 1), (2, 3)])
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=["input"],
+        input_dims=[input_dims],
+        output_names=["output"],
+        expected_output_dims=[(13, 7, 11, 5)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ["TRTEngineOp_0"]
+
+
+class Conv2DStridedNCHWTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Testing conversion of strided Conv2D (data_format=NCHW) in TF-TRT
+
+    conversion.
+    """
+    np.random.seed(1234)
+    dtype = dtypes.float32
+    input_name = "input"
+    n, c, h, w = 13, 3, 7, 11
+    num_filters = 5
+    input_dims = [n, c, h, w]
+    output_name = "output"
+    g = ops.Graph()
+    with g.as_default():
+      inp = array_ops.placeholder(
+          dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
+      with g.device("/GPU:0"):
+        output = inp
+        output = conv2d_layer(
+            output,
+            num_filters, (3, 2),
+            strides=(2, 2),
+            padding="same",
+            data_format="channels_first")
+        h = div_round_up(h, 2)
+        w = div_round_up(w, 2)
+        output = conv2d_layer(
+            output,
+            num_filters, (3, 3),
+            strides=(2, 2),
+            dilation_rate=(2, 3),
+            padding="same",
+            data_format="channels_first")
+        h = div_round_up(h, 2)
+        w = div_round_up(w, 2)
+        output = array_ops.identity(output, name=output_name)
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[input_dims],
+        output_names=[output_name],
+        expected_output_dims=[(n, num_filters, h, w)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ["TRTEngineOp_0"]
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/identity_output_test.py b/tensorflow/contrib/tensorrt/test/identity_output_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1c6dd52040f73f4adf8152e19717122798755bb
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/identity_output_test.py
@@ -0,0 +1,80 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""This test checks a situation where the same tensor is considered as an output
+
+multiple times because it has been duplicated by 2+ indentity ops. Previously,
+the tensor would be renamed multiple times, overwriting the output binding name
+which resulted in a runtime error when the binding would not be found.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class IdentityTest(trt_test.TfTrtIntegrationTestBase):
+
+  def _ConstOp(self, shape):
+    return constant_op.constant(np.random.randn(*shape), dtype=dtypes.float32)
+
+  def GetParams(self):
+    """Testing engine with the same tensor repeated as output via identity."""
+    input_name = 'input'
+    input_dims = [100, 32]
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(
+          dtype=dtypes.float32, shape=input_dims, name=input_name)
+
+      b = self._ConstOp((32, 4))
+      x1 = math_ops.matmul(x, b)
+      b = self._ConstOp((1, 4))
+      x1 = x1 + b
+
+      out1 = array_ops.identity(x1, name='output1')
+      out2 = array_ops.identity(x1, name='output2')
+      iden1 = array_ops.identity(x1)
+      out3 = array_ops.identity(iden1, name='output3')
+
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[input_dims],
+        output_names=['output1', 'output2', 'output3'],
+        expected_output_dims=[(100, 4), (100, 4), (100, 4)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ['TRTEngineOp_0']
+
+  def ShouldRunTest(self, run_params):
+    """Whether to run the test."""
+    # TODO(aaroey): Trt 4.0 forbids conversion for tensors with rank <3 in int8
+    # mode, which is a bug. Re-enable this when trt library is fixed.
+    return not trt_test.IsQuantizationMode(run_params.precision_mode)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py b/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py
index e7d6ec4ad395d38a06f97020f2f363009f2286c7..79dde2a6b577acc7664aef37354e3a2d8e408cc2 100644
--- a/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py
+++ b/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py
@@ -144,7 +144,10 @@ class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
           outputs=[OUTPUT_NODE_NAME],
           max_batch_size=max_batch_size,
           precision_mode='INT8',
-          max_workspace_size_bytes=4096 << 19,
+          # There is a 2GB GPU memory limit for each test, so we set
+          # max_workspace_size_bytes to 256MB to leave enough room for TF
+          # runtime to allocate GPU memory.
+          max_workspace_size_bytes=1 << 28,
           minimum_segment_size=2,
           use_calibration=False,
       )
diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
index 495a9391a1e818a6078988161c9bf72f6143737f..671abba6a6862eb1c7a339d1897be0bb8ff90d30 100644
--- a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
+++ b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
@@ -239,8 +239,8 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
 
   def _GetConfigProto(self, run_params, graph_state):
     """Get config proto based on specific settings."""
+    conversion_params = self.GetConversionParams(run_params)
     if graph_state != GraphState.ORIGINAL and run_params.use_optimizer:
-      conversion_params = self.GetConversionParams(run_params)
       rewriter_cfg = trt_convert.get_tensorrt_rewriter_config(
           conversion_params.rewriter_config, conversion_params.max_batch_size,
           conversion_params.max_workspace_size_bytes,
@@ -254,6 +254,9 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
       graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_cfg)
     else:
       graph_options = config_pb2.GraphOptions()
+      if conversion_params.rewriter_config is not None:
+        graph_options.rewrite_options.CopyFrom(
+            conversion_params.rewriter_config)
 
     config = config_pb2.ConfigProto(
         gpu_options=self._GetGPUOptions(), graph_options=graph_options)
@@ -321,16 +324,13 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
     return self._RunGraph(
         run_params, gdef, input_data, config, GraphState.CALIBRATE, num_runs=5)
 
-  def _GetTrtGraphDef(self, run_params, gdef):
+  def _GetTrtGraphDef(self, run_params, graph_state, gdef):
     """Return trt converted graphdef."""
     params = self._GetParamsCached()
     conversion_params = self.GetConversionParams(run_params)
     logging.info(conversion_params)
 
-    config_for_trt = config_pb2.ConfigProto(gpu_options=self._GetGPUOptions())
-    if conversion_params.rewriter_config is not None:
-      config_for_trt.graph_options.rewrite_options.CopyFrom(
-          conversion_params.rewriter_config)
+    config_for_trt = self._GetConfigProto(run_params, graph_state)
     return trt_convert.create_inference_graph(
         input_graph_def=gdef,
         outputs=params.input_names + params.output_names,
@@ -506,7 +506,8 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
         result = self._RunCalibration(run_params, input_gdef, input_data,
                                       calib_config)
       else:
-        calib_gdef = self._GetTrtGraphDef(run_params, input_gdef)
+        calib_gdef = self._GetTrtGraphDef(run_params, GraphState.CALIBRATE,
+                                          input_gdef)
         self._VerifyGraphDef(run_params, calib_gdef, GraphState.CALIBRATE)
         result = self._RunCalibration(run_params, calib_gdef, input_data,
                                       calib_config)
@@ -527,7 +528,8 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
     logging.info("Running final inference graph, config:\n%s",
                  str(infer_config))
     if not run_params.use_optimizer:
-      infer_gdef = self._GetTrtGraphDef(run_params, infer_gdef)
+      infer_gdef = self._GetTrtGraphDef(run_params, GraphState.INFERENCE,
+                                        infer_gdef)
       self._VerifyGraphDef(run_params, infer_gdef, GraphState.INFERENCE)
 
     result = self._RunGraph(run_params, infer_gdef, input_data, infer_config,
diff --git a/tensorflow/contrib/timeseries/examples/predict_test.py b/tensorflow/contrib/timeseries/examples/predict_test.py
index 678fd71cd8b94ee0be46e10a9a673de55bd44215..b353f85cb5df0cf961d1900b241e4fa1a84a24b4 100644
--- a/tensorflow/contrib/timeseries/examples/predict_test.py
+++ b/tensorflow/contrib/timeseries/examples/predict_test.py
@@ -43,10 +43,6 @@ class PeriodTrendExampleTest(test.TestCase):
     self.assertAllEqual([700], mean.shape)
     self.assertAllEqual([700], upper_limit.shape)
     self.assertAllEqual([700], lower_limit.shape)
-    # Check that variance hasn't blown up too much. This is a relatively good
-    # indication that training was successful.
-    self.assertLess(upper_limit[-1] - lower_limit[-1],
-                    1.5 * (upper_limit[0] - lower_limit[0]))
 
   def test_ar(self):
     (times, observed, all_times, mean,
@@ -55,7 +51,6 @@ class PeriodTrendExampleTest(test.TestCase):
     self.assertAllEqual(all_times.shape, mean.shape)
     self.assertAllEqual(all_times.shape, upper_limit.shape)
     self.assertAllEqual(all_times.shape, lower_limit.shape)
-    self.assertLess((upper_limit - lower_limit).mean(), 4.)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index 4b90b596b28efec83aa349782c4874d79b6817c7..a0c3204d41d8a715fe572223308a32eb39e0b963 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -361,9 +361,10 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":feature_keys",
+        ":math_utils",
         ":model",
         ":model_utils",
-        "//tensorflow/contrib/distributions:distributions_py",
+        "//tensorflow/contrib/rnn:rnn_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:constant_op",
diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
index bcadf4094e1e79fff1685515f2bde0b88f717cac..a8d5e1a49dd4313f58f2f515bc3f292ecce5cbd4 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
@@ -18,9 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib import distributions
-
 from tensorflow.contrib.rnn.python.ops import lstm_ops
+from tensorflow.contrib.timeseries.python.timeseries import math_utils
 from tensorflow.contrib.timeseries.python.timeseries import model
 from tensorflow.contrib.timeseries.python.timeseries import model_utils
 from tensorflow.contrib.timeseries.python.timeseries.feature_keys import PredictionFeatures
@@ -462,8 +461,8 @@ class ARModel(model.TimeSeriesModel):
     if self.loss == ARModel.NORMAL_LIKELIHOOD_LOSS:
       covariance = prediction_ops["covariance"]
       sigma = math_ops.sqrt(gen_math_ops.maximum(covariance, 1e-5))
-      normal = distributions.Normal(loc=targets, scale=sigma)
-      loss_op = -math_ops.reduce_sum(normal.log_prob(prediction))
+      loss_op = -math_ops.reduce_sum(
+          math_utils.normal_log_prob(targets, sigma, prediction))
     else:
       assert self.loss == ARModel.SQUARED_LOSS, self.loss
       loss_op = math_ops.reduce_sum(math_ops.square(prediction - targets))
@@ -965,16 +964,11 @@ class AnomalyMixtureARModel(ARModel):
       anomaly_variance = prediction_ops["anomaly_params"]
       anomaly_sigma = math_ops.sqrt(
           gen_math_ops.maximum(anomaly_variance, 1e-5))
-      normal = distributions.Normal(loc=targets, scale=anomaly_sigma)
-      log_prob = normal.log_prob(prediction)
+      log_prob = math_utils.normal_log_prob(targets, anomaly_sigma, prediction)
     else:
       assert self._anomaly_distribution == AnomalyMixtureARModel.CAUCHY_ANOMALY
       anomaly_scale = prediction_ops["anomaly_params"]
-      cauchy = distributions.StudentT(
-          df=array_ops.ones([], dtype=anomaly_scale.dtype),
-          loc=targets,
-          scale=anomaly_scale)
-      log_prob = cauchy.log_prob(prediction)
+      log_prob = math_utils.cauchy_log_prob(targets, anomaly_scale, prediction)
     return log_prob
 
   def loss_op(self, targets, prediction_ops):
@@ -983,8 +977,7 @@ class AnomalyMixtureARModel(ARModel):
     covariance = prediction_ops["covariance"]
     # Normal data log probability.
     sigma = math_ops.sqrt(gen_math_ops.maximum(covariance, 1e-5))
-    normal1 = distributions.Normal(loc=targets, scale=sigma)
-    log_prob1 = normal1.log_prob(prediction)
+    log_prob1 = math_utils.normal_log_prob(targets, sigma, prediction)
     log_prob1 += math_ops.log(1 - self._anomaly_prior_probability)
     # Anomaly log probability.
     log_prob2 = self._anomaly_log_prob(targets, prediction_ops)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
index aab330643862c1ccf073d2a0e34e1c475b1ec15f..b7375e5055e29efea3f23c3b9b9f3af59f45495b 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 import collections
 import math
 
+import numpy as np
+
 from tensorflow.contrib import lookup
 from tensorflow.contrib.layers.python.layers import layers
 
@@ -43,6 +45,32 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import nest
 
 
+def normal_log_prob(loc, scale, x):
+  """Computes the Normal log pdf."""
+  z = (x - loc) / scale
+  return -0.5 * (math_ops.square(z)
+                 + np.log(2. * np.pi) + math_ops.log(scale))
+
+
+def cauchy_log_prob(loc, scale, x):
+  """Computes the Cauchy log pdf."""
+  z = (x - loc) / scale
+  return (-np.log(np.pi) - math_ops.log(scale) -
+          math_ops.log1p(math_ops.square(z)))
+
+
+def mvn_tril_log_prob(loc, scale_tril, x):
+  """Computes the MVN log pdf under tril scale. Doesn't handle batches."""
+  x0 = x - loc
+  z = linalg_ops.matrix_triangular_solve(
+      scale_tril, x0[..., array_ops.newaxis])[..., 0]
+  log_det_cov = 2. * math_ops.reduce_sum(math_ops.log(
+      array_ops.matrix_diag_part(scale_tril)), axis=-1)
+  d = math_ops.cast(array_ops.shape(scale_tril)[-1], log_det_cov.dtype)
+  return -0.5 * (math_ops.reduce_sum(math_ops.square(z), axis=-1)
+                 + d * np.log(2. * np.pi) + log_det_cov)
+
+
 def clip_covariance(
     covariance_matrix, maximum_variance_ratio, minimum_variance):
   """Enforce constraints on a covariance matrix to improve numerical stability.
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
index 125750e7639ad40c481472a93353e6fb7055be96..cf5e749042afd83f927a3d22edfd3a9538ab2ffd 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
@@ -78,7 +78,6 @@ py_library(
     srcs = ["kalman_filter.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/contrib/timeseries/python/timeseries:math_utils",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
@@ -235,7 +234,6 @@ py_library(
     srcs = ["filtering_postprocessor.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/contrib/timeseries/python/timeseries:math_utils",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor.py
index e9e2ac0aaf4c4d6c41f5007662f261af3de9bbd1..3fa2fbd9f77cb887c30fde264815728ca345f45a 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor.py
@@ -22,8 +22,6 @@ import abc
 
 import six
 
-from tensorflow.contrib import distributions
-
 from tensorflow.contrib.timeseries.python.timeseries import math_utils
 
 from tensorflow.python.framework import dtypes
@@ -91,10 +89,10 @@ def cauchy_alternative_to_gaussian(current_times, current_values, outputs):
   """
   del current_times  # unused
   cauchy_scale = math_utils.entropy_matched_cauchy_scale(outputs["covariance"])
-  individual_log_pdfs = distributions.StudentT(
-      df=array_ops.ones([], dtype=current_values.dtype),
+  individual_log_pdfs = math_utils.cauchy_log_prob(
       loc=outputs["mean"],
-      scale=cauchy_scale).log_prob(current_values)
+      scale=cauchy_scale,
+      x=current_values)
   return math_ops.reduce_sum(individual_log_pdfs, axis=1)
 
 
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py
index a614386121e000961bf8b32625a28e1251654320..c0ec797bc5b7c41ca996c807840ce38311201f87 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib import distributions
-
 from tensorflow.contrib.timeseries.python.timeseries import math_utils
 
 from tensorflow.python.framework import dtypes
@@ -137,9 +135,10 @@ class KalmanFilter(object):
     with ops.control_dependencies([non_negative_assert]):
       observation_covariance_cholesky = linalg_ops.cholesky(
           symmetrized_observation_covariance)
-    log_prediction_prob = distributions.MultivariateNormalTriL(
-        predicted_observation, observation_covariance_cholesky).log_prob(
-            observation)
+    log_prediction_prob = math_utils.mvn_tril_log_prob(
+        loc=predicted_observation,
+        scale_tril=observation_covariance_cholesky,
+        x=observation)
     (posterior_state,
      posterior_state_var) = self.posterior_from_prior_state(
          prior_state=estimated_state,
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index 3cbf265f68f28d374daeeb44183e8e82a796ac7f..b9c225b44a2f03c8f6d1ef23f75cb5df5667a002 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -70,12 +70,15 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":async_checkpoint",
+        ":functional",
         ":tpu_lib",
+        ":tpu_ordinal_selector_py",
         "//tensorflow/contrib/training:training_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:function",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
@@ -155,6 +158,24 @@ tf_gen_op_wrapper_py(
     ],
 )
 
+tf_custom_op_library(
+    name = "python/ops/_tpu_ordinal_selector.so",
+    srcs = ["ops/tpu_ordinal_selector_op.cc"],
+)
+
+tf_custom_op_py_library(
+    name = "tpu_ordinal_selector_py",
+    srcs = ["ops/gen_tpu_ordinal_selector_op.py"],
+    dso = [":python/ops/_tpu_ordinal_selector.so"],
+    kernels = [
+        ":tpu_ordinal_selector_op_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":tpu_ordinal_selector_op",
+    ],
+)
+
 tf_gen_op_wrapper_py(
     name = "tpu_ordinal_selector_op",
     deps = [
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
index f27ae38e0434991da7475e631be1c6cb4a463118..807cf26fe983b4ebe17695d6f4f90ecfc0e0cbf5 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
@@ -33,7 +33,7 @@ setup(
     long_description='Tools for capture TPU profile',
     url='https://www.tensorflow.org/tfrc/',
     author='Google Inc.',
-    author_email='opensource@google.com',
+    author_email='packages@tensorflow.org',
     packages=['cloud_tpu_profiler'],
     package_data={
         'cloud_tpu_profiler': ['data/*'],
diff --git a/tensorflow/contrib/tpu/python/tpu/feature_column.py b/tensorflow/contrib/tpu/python/tpu/feature_column.py
index d5d00d628d407bf3bb5312bd54f6ccd13dc37db4..8edf131bc24fd003806263570b63ee8514c49896 100644
--- a/tensorflow/contrib/tpu/python/tpu/feature_column.py
+++ b/tensorflow/contrib/tpu/python/tpu/feature_column.py
@@ -17,7 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import contextlib
 import math
 
 from tensorflow.contrib.tpu.python.tpu import tpu
@@ -279,11 +278,10 @@ class _TPUEmbeddingColumn(_TPUBaseEmbeddingColumn, fc._EmbeddingColumn):
 
   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     if tpu.under_tpu_inference_context():
-      # TODO(shizhiw, b/112012627, b/112336539): Replace _outside_all_rewrites()
-      # with outside compilation.
-      with _outside_all_rewrites():
+      def host_computation():
         return fc._EmbeddingColumn._get_dense_tensor(
             self, inputs, weight_collections, trainable)
+      return tpu.outside_compilation(host_computation)
 
     if _is_running_on_cpu():
       return fc._EmbeddingColumn._get_dense_tensor(
@@ -300,13 +298,6 @@ class _TPUEmbeddingColumn(_TPUBaseEmbeddingColumn, fc._EmbeddingColumn):
     return tensor
 
 
-@contextlib.contextmanager
-def _outside_all_rewrites():
-  """'Break out' of a tpu.rewrite() (or shard(), etc.)."""
-  with ops.control_dependencies(None):
-    yield
-
-
 class _TPUSharedEmbeddingColumn(_TPUBaseEmbeddingColumn,
                                 fc._SharedEmbeddingColumn):
   """Core Shared Embedding Column."""
@@ -385,11 +376,10 @@ class _TPUSharedEmbeddingColumn(_TPUBaseEmbeddingColumn,
 
   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     if tpu.under_tpu_inference_context():
-      # TODO(shizhiw, b/112012627, b/112336539): Replace _outside_all_rewrites()
-      # with outside compilation.
-      with _outside_all_rewrites():
+      def host_computation():
         return fc._SharedEmbeddingColumn._get_dense_tensor(
             self, inputs, weight_collections, trainable)
+      return tpu.outside_compilation(host_computation)
 
     if _is_running_on_cpu():
       return fc._SharedEmbeddingColumn._get_dense_tensor(
diff --git a/tensorflow/contrib/tpu/python/tpu/session_support.py b/tensorflow/contrib/tpu/python/tpu/session_support.py
index 3e463823c820a3ef8628324f77e1a9caf8d385d5..f5735cecc38b7033f21fc4d4105cfead233379fa 100644
--- a/tensorflow/contrib/tpu/python/tpu/session_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/session_support.py
@@ -185,7 +185,8 @@ def all_worker_devices(session):
   """Return a list of devices for each worker in the system."""
   devices = session.list_devices()
   return [
-      device.name for device in devices
+      device.name
+      for device in devices
       if ':CPU:' in device.name and 'coordinator' not in device.name
   ]
 
@@ -255,12 +256,14 @@ class WatchdogManager(threading.Thread):
     self._worker_manager.configure(
         event_pb2.WorkerHeartbeatRequest(
             watchdog_config=event_pb2.WatchdogConfig(
-                timeout_ms=self.shutdown_timeout * 1000,)))
+                timeout_ms=self.shutdown_timeout * 1000,),
+            shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
 
   def configure_and_run(self):
-    logging.info('Enabling watchdog timer with %d second timeout '
-                 'and %d second ping interval.',
-                 self.shutdown_timeout, self.ping_interval)
+    logging.info(
+        'Enabling watchdog timer with %d second timeout '
+        'and %d second ping interval.', self.shutdown_timeout,
+        self.ping_interval)
     self._reset_manager()
     self._running = True
     self.start()
@@ -269,7 +272,8 @@ class WatchdogManager(threading.Thread):
     logging.info('Stopping worker watchdog.')
     self._worker_manager.configure(
         event_pb2.WorkerHeartbeatRequest(
-            watchdog_config=event_pb2.WatchdogConfig(timeout_ms=-1,)))
+            watchdog_config=event_pb2.WatchdogConfig(timeout_ms=-1,),
+            shutdown_mode=event_pb2.NOT_CONFIGURED))
     self._running = False
     self.join()
 
diff --git a/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py b/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
index 66689cde2fb53c1d9909224fcf38e5e9a7dd729b..bf492e78a15acc92017663a286e8c8f0b2045339 100644
--- a/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
+++ b/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
@@ -59,6 +59,7 @@ _REASON_SCALAR_GET_TRACED = 'traced-scalar'
 _REASON_TENSOR_GET_TRACED = 'traced-tensor'
 _REASON_USER_INCLUDED = 'traced-user-included'
 _REASON_USER_EXCLUDED = 'not-traced-user-excluded'
+_REASON_NOT_EXECUTED = 'not-traced-not-in-exec-path'
 _REASON_NON_NUMERIC_TENSOR = 'not-traced-non-numeric-tensor'
 _MARKER_SECTION_BEGIN = '!!!!!!! section-begin:'
 _MARKER_SECTION_END = '!!!!!!! section-end:'
@@ -377,10 +378,7 @@ class TensorTracer(object):
       return True
     # Reasons for not including following op types:
     #    Assign: cause incorrect result with CPU tracing.
-    #    others: compilation problems.
-    # TODO(deveci): Check if 'Pack', 'Shape', 'Reshape', 'ArgMin', 'ArgMax'
-    #               are still unsafe now that we have handled int64 tensors.
-    if op.type in ['Assign', 'Pack', 'Shape', 'Reshape', 'ArgMin', 'ArgMax']:
+    if op.type in ['Assign']:
       return True
     return False
 
@@ -471,7 +469,7 @@ class TensorTracer(object):
                 temporarily_marked_ops, sorted_ops)
       # pylint: disable=protected-access
       for ctrl_output_op in op._control_outputs:
-      # pylint: enable=protected-access
+        # pylint: enable=protected-access
         visit(ctrl_output_op, cycle, permanently_marked_ops,
               temporarily_marked_ops, sorted_ops)
       temporarily_marked_ops.remove(op)
@@ -737,6 +735,59 @@ class TensorTracer(object):
       self._write_report('%d "%s"\n'%(i, l[i].name))
     self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_GRAPH))
 
+  def _preprocess_traced_tensor(self, tensor):
+    """Computes NAN/Norm/Max on TPUs before sending to CPU.
+
+    Args:
+      tensor: The tensor to be traced.
+    Returns:
+      A tensor that should be input to the trace_function.
+    Raises:
+      RuntimeError: If the trace mode is invalid.
+    """
+
+    def _detect_nan_inf(tensor):
+      """Trace function for detecting any NaN/Inf in the tensor."""
+
+      if tensor.dtype.is_floating:
+        output_tensor = math_ops.reduce_any(
+            gen_math_ops.logical_or(
+                gen_math_ops.is_nan(tensor), gen_math_ops.is_inf(tensor)))
+      else:
+        output_tensor = constant_op.constant(False)
+      # The shape has to be 1. Set it if it does not have the information.
+      output_tensor = array_ops.reshape(output_tensor, [1])
+      return output_tensor
+
+    def _show_norm(tensor):
+      tensor = math_ops.cast(tensor, dtypes.float32)
+      output_tensor = linalg_ops.norm(tensor)
+      # The shape has to be 1. Set it if it does not have the information.
+      output_tensor = array_ops.reshape(output_tensor, [1])
+      return output_tensor
+
+    def _show_max_abs(tensor):
+      tensor = math_ops.cast(tensor, dtypes.float32)
+      output_tensor = math_ops.reduce_max(math_ops.abs(tensor))
+      zero = constant_op.constant(0, dtypes.float32)
+      output_tensor = gen_math_ops.maximum(zero, output_tensor)
+      # The shape has to be 1. Set it if it does not have the information.
+      output_tensor = array_ops.reshape(output_tensor, [1])
+      return output_tensor
+
+    if self._trace_mode == _TRACE_MODE_NAN_INF:
+      return _detect_nan_inf(tensor)
+    if self._trace_mode == _TRACE_MODE_PART_TENSOR:
+      return tensor
+    if self._trace_mode == _TRACE_MODE_FULL_TENSOR:
+      return tensor
+    if self._trace_mode == _TRACE_MODE_NORM:
+      return _show_norm(tensor)
+    if self._trace_mode == _TRACE_MODE_MAX_ABS:
+      return _show_max_abs(tensor)
+    raise RuntimeError(
+        'Tensor trace fun for %s is not yet implemented' % self._trace_mode)
+
   def _make_tensor_trace_fun(self, tensor_name):
     """Makes the tensor tracing function called by outside compilation.
 
@@ -787,29 +838,6 @@ class TensorTracer(object):
       with ops.control_dependencies([print_op]):
         return array_ops.identity(tensor).op
 
-    def _detect_nan_inf(tensor):
-      """Trace function for detecting any NaN/Inf in the tensor."""
-
-      if tensor.dtype.is_floating:
-        output_tensor = math_ops.reduce_any(
-            gen_math_ops.logical_or(gen_math_ops.is_nan(tensor),
-                                    gen_math_ops.is_inf(tensor)))
-      else:
-        output_tensor = constant_op.constant(False)
-
-      return _print_tensor(tensor_name, -1, tensor, output_tensor)
-
-    def _show_norm(tensor):
-      tensor = math_ops.cast(tensor, dtypes.float64)
-      output_tensor = linalg_ops.norm(tensor)
-      return _print_tensor(tensor_name, -1, tensor, output_tensor)
-
-    def _show_max_abs(tensor):
-      output_tensor = math_ops.cast(math_ops.reduce_max(math_ops.abs(tensor)),
-                                    dtypes.float64)
-      zero = constant_op.constant(0, dtypes.float64)
-      output_tensor = gen_math_ops.maximum(zero, output_tensor)
-      return _print_tensor(tensor_name, -1, tensor, output_tensor)
 
     def _show_part_tensor(tensor):
       """Trace function for printing part of the tensor."""
@@ -822,21 +850,23 @@ class TensorTracer(object):
 
       return _print_tensor(tensor_name, -1, tensor, tensor)
 
-    if self._trace_mode == _TRACE_MODE_NAN_INF:
-      return _detect_nan_inf
     if self._trace_mode == _TRACE_MODE_PART_TENSOR:
       return _show_part_tensor
-    if self._trace_mode == _TRACE_MODE_FULL_TENSOR:
+    # The input tensor has a shape of "[1]" for _TRACE_MODE_NAN_INF,
+    # _TRACE_MODE_NORM, and _TRACE_MODE_MAX_ABS, as related computations are
+    # performed within TPUs and only their results are transferred to CPU.
+    # Simply, print the full tensor for these trace modes.
+    if self._trace_mode in [
+        _TRACE_MODE_NAN_INF, _TRACE_MODE_NORM, _TRACE_MODE_FULL_TENSOR,
+        _TRACE_MODE_MAX_ABS
+    ]:
       return _show_full_tensor
-    if self._trace_mode == _TRACE_MODE_NORM:
-      return _show_norm
-    if self._trace_mode == _TRACE_MODE_MAX_ABS:
-      return _show_max_abs
 
     raise RuntimeError('Tensor trace fun for %s is not yet implemented'
                        %self._trace_mode)
 
-  def _skip_op(self, op_id, op, user_included, user_excluded):
+  def _skip_op(self, op_id, op, user_included, user_excluded,
+               in_exec_path=True):
     """Returns True if we should not trace Op."""
 
     if user_included:
@@ -847,6 +877,10 @@ class TensorTracer(object):
       self._instrument_records[op.name] = TensorTracer.reason(
           op_id, _REASON_USER_EXCLUDED)
       return True
+    if not in_exec_path:
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_NOT_EXECUTED)
+      return True
     if not self._inside_op_range(op_id):
       self._instrument_records[op.name] = TensorTracer.reason(
           op_id, _REASON_OUTSIDE_OP_RANGE)
@@ -889,9 +923,18 @@ class TensorTracer(object):
           op_id, _REASON_USER_EXCLUDED)
       return True
     if not out_tensor.get_shape().is_fully_defined():
-      self._instrument_records[out_tensor.name] = TensorTracer.reason(
-          op_id, _REASON_DYNAMIC_SHAPE)
-      return True
+      # If trace mode is nan-inf, norm or max, then the tensor will be reduced
+      # to a scalar before the outside compilation call.
+      if self._trace_mode in [
+          _TRACE_MODE_NAN_INF, _TRACE_MODE_NORM, _TRACE_MODE_MAX_ABS
+      ]:
+        self._instrument_records[out_tensor.name] = TensorTracer.reason(
+            op_id, _REASON_TENSOR_GET_TRACED)
+        return False
+      else:
+        self._instrument_records[out_tensor.name] = TensorTracer.reason(
+            op_id, _REASON_DYNAMIC_SHAPE)
+        return True
     rank = len(out_tensor.shape)
     if rank < 1:
       # scalar
@@ -909,6 +952,40 @@ class TensorTracer(object):
           op_id, _REASON_TENSOR_GET_TRACED)
       return False
 
+  def _filter_execution_path_operations(self, operations, fetches):
+    """Returns the set of ops in the execution path to compute given fetches."""
+    # If no fetch provided, then return all operations.
+    if fetches is None:
+      return set(operations)
+    # Convert to list, if a single element is provided.
+    if not isinstance(fetches, (list, tuple)):
+      fetches = [fetches]
+    # If a tensor is given as fetch, convert it to op.
+    op_fetches = []
+    for fetch in fetches:
+      if isinstance(fetch, ops.Operation):
+        op_fetches.append(fetch)
+      elif isinstance(fetch, ops.Tensor):
+        op_fetches.append(fetch.op)
+      else:
+        raise RuntimeError('Given fetch:%s is neither a tensor nor an op.'
+                           %fetch)
+
+    execution_path_operations = set(op_fetches)
+    traverse_stack = list(op_fetches)
+    while True:
+      if not traverse_stack:
+        break
+      head_op = traverse_stack.pop()
+      input_ops = [tensor_input.op for tensor_input in head_op.inputs]
+      input_ops.extend(head_op.control_inputs)
+
+      for input_op in input_ops:
+        if input_op not in execution_path_operations:
+          execution_path_operations.add(input_op)
+          traverse_stack.append(input_op)
+    return execution_path_operations
+
   def _pre_tracing(self, graph):
     """Work needs to be done prior to TPU or CPU tracing."""
 
@@ -950,13 +1027,15 @@ class TensorTracer(object):
                                   _TENSOR_TRACER_CHECKPOINT))
     return checkpoint_operations
 
-  def trace_tpu(self, graph, result_tensor, num_replicas=None):
+  def trace_tpu(self, graph, result_tensor, num_replicas=None, fetches=None):
     """Traces the tensors generated by TPU Ops in a TF graph.
 
     Args:
       graph: the graph of Ops executed on the TPU.
       result_tensor: a result tensor of evaluating the graph.
       num_replicas: number of replicas used on the TPU.
+      fetches: the list of fetches given to session.run, used to determine the
+      ops in execution path. If None, the whole graph will be traced.
 
     Returns:
       A tuple (result_tensor_copy, tracing_ops), where:
@@ -985,6 +1064,10 @@ class TensorTracer(object):
     result_tensor_copy = self._add_replica_id_to_graph(num_replicas,
                                                        result_tensor)
     (operations, succeed, sorted_or_cycle) = self._pre_tracing(graph)
+    # Filter out the operations that won't be executed.
+    # if fetches=None, then ops_in_exec_path = set(operations)
+    ops_in_exec_path = self._filter_execution_path_operations(operations,
+                                                              fetches)
     tracing_ops = []
     checkpoint_operations = self._get_checkpoints(graph)
 
@@ -993,18 +1076,23 @@ class TensorTracer(object):
         continue
       user_included = self._is_user_included_op(op)
       user_excluded = self._is_user_excluded_op(op)
-      if self._skip_op(op_id, op, user_included, user_excluded):
+      in_exec_path = op in ops_in_exec_path
+      if self._skip_op(op_id, op, user_included, user_excluded, in_exec_path):
         continue
       for i in range(len(op.outputs)):
         out_tensor = op.outputs[i]
         if self._skip_tensor(op_id, out_tensor, user_included,
                              user_excluded):
           continue
+        # Create the list of consumers before calling _preprocess_traced_tensor.
+        # Otherwise, adding control input below, will introduce a cycle in the
+        # graph.
         consumers = out_tensor.consumers()
         tensor_name = out_tensor.name
-        out_tensor = _cast_unsupported_dtypes(out_tensor)
+        processed_out_tensor = self._preprocess_traced_tensor(out_tensor)
+        processed_out_tensor = _cast_unsupported_dtypes(processed_out_tensor)
         trace_op = tpu.outside_compilation(
-            self._make_tensor_trace_fun(tensor_name), out_tensor)
+            self._make_tensor_trace_fun(tensor_name), processed_out_tensor)
         if consumers:
           for consumer_op in consumers:
             # pylint: disable=protected-access
@@ -1050,8 +1138,9 @@ class TensorTracer(object):
         if self._skip_tensor(op_id, out_tensor, user_included,
                              user_excluded):
           continue
+        processed_out_tensor = self._preprocess_traced_tensor(out_tensor)
         trace_fun = self._make_tensor_trace_fun(out_tensor.name)
-        trace_call = (trace_fun, [out_tensor])
+        trace_call = (trace_fun, [processed_out_tensor])
         trace_call_key = 'tensor_tracing_cpu-%s:%d'%(op.name, i)
         tracing_calls[trace_call_key] = trace_call
     self._post_tracing(succeed, sorted_or_cycle)
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 9266d81cf5fc035790062f0e307a5da0b01a9fc1..a267dd435c04c21bbeb3dc09a325267e7b22286e 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -36,6 +36,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
+from tensorflow.python.util import nest
 
 
 # Operations that indicate some error in the users graph, e.g. a placeholder
@@ -487,7 +488,11 @@ def replicate(computation,
     computation: A Python function that builds the computation to replicate.
     inputs: A list of lists of input tensors or `None` (equivalent to
       `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
-      have the same number of inputs.
+      have the same number of inputs. Each input can be a nested structure
+      containing values that are convertible to tensors. Note that passing an
+      N-dimension list of compatible values will result in a N-dimention list of
+      scalar tensors rather than a single Rank-N tensors. If you need different
+      behavior, convert part of inputs to tensors with `tf.convert_to_tensor`.
     infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
       of arguments as inputs to computation.
     device_assignment: If not `None`, a `DeviceAssignment` describing the
@@ -526,7 +531,11 @@ def split_compile_and_replicate(computation,
     computation: A Python function that builds the computation to replicate.
     inputs: A list of lists of input tensors or `None` (equivalent to
       `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
-      have the same number of inputs.
+      have the same number of inputs. Each input can be a nested structure
+      containing values that are convertible to tensors. Note that passing an
+      N-dimension list of compatible values will result in a N-dimention list of
+      scalar tensors rather than a single Rank-N tensors. If you need different
+      behavior, convert part of inputs to tensors with `tf.convert_to_tensor`.
     infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
       of arguments as inputs to computation.
     device_assignment: If not `None`, a `DeviceAssignment` describing the
@@ -580,24 +589,32 @@ def split_compile_and_replicate(computation,
   if num_replicas == 0:
     return []
 
+  # Checks all replicas have the same structure.
+  for i in xrange(1, num_replicas):
+    nest.assert_same_structure(inputs[0], inputs[i])
+
+  # Flatten inputs.
+  flat_inputs = [
+      nest.flatten(per_replica_input) for per_replica_input in inputs
+  ]
   # Converts inputs to Tensors.
-  inputs = [[ops.convert_to_tensor(x) for x in inp] for inp in inputs]
+  flat_inputs = [[ops.convert_to_tensor(x) for x in inp] for inp in flat_inputs]
 
   # Verifies that all replicas have matching numbers and types of inputs
-  input_types = [x.dtype for x in inputs[0]]
-  input_arity = len(input_types)
+  flat_input_types = [x.dtype for x in flat_inputs[0]]
+  input_arity = len(inputs[0])
+  flat_input_arity = len(flat_input_types)
   for i in range(num_replicas):
     if len(inputs[i]) != input_arity:
       raise ValueError("Replicas must have the same number of inputs. "
                        "Replica 0 had {} inputs, replica {} had {} "
                        "inputs.".format(input_arity, i, len(inputs[i])))
 
-    types = [x.dtype for x in inputs[i]]
-    if types != input_types:
-      raise ValueError(
-          "Replicas must have matching input types. Replica 0 had "
-          "input types {}, replica {} had input types {}".format(
-              input_types, i, types))
+    types = [x.dtype for x in flat_inputs[i]]
+    if types != flat_input_types:
+      raise ValueError("Replicas must have matching input types. Replica 0 had "
+                       "input types {}, replica {} had input types {}".format(
+                           flat_input_types, i, types))
 
   arg_error = xla.check_function_argument_count(
       computation, input_arity, infeed_queue)
@@ -620,8 +637,8 @@ def split_compile_and_replicate(computation,
 
   # Fan-in: Builds a TPUReplicatedInput node for each input.
   computation_inputs = []
-  for i in range(0, input_arity):
-    replicas = [inputs[replica][i] for replica in xrange(num_replicas)]
+  for i in range(0, flat_input_arity):
+    replicas = [flat_inputs[replica][i] for replica in xrange(num_replicas)]
     computation_inputs.append(
         tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
 
@@ -651,6 +668,10 @@ def split_compile_and_replicate(computation,
         i.op._set_attr("_tpu_input_identity", attr_value_pb2.AttrValue(b=True))
         # pylint: enable=protected-access
 
+      # Unflatten the computation inputs to match original input structure.
+      computation_inputs = nest.pack_sequence_as(
+          structure=inputs[0], flat_sequence=computation_inputs)
+
       # If there is an infeed queue, adds the dequeued values to the
       # computation's inputs.
       if infeed_queue is not None:
@@ -1092,6 +1113,11 @@ def rewrite(computation,
       All `Operation`s constructed during `computation` will be executed when
       evaluating any of the returned output tensors, not just the ones returned.
     inputs: A list of input tensors or `None` (equivalent to an empty list).
+      Each input can be a nested structure containing values that are
+      convertible to tensors. Note that passing an N-dimension list of
+      compatible values will result in a N-dimention list of scalar tensors
+      rather than a single Rank-N tensors. If you need different behavior,
+      convert part of inputs to tensors with `tf.convert_to_tensor`.
     infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
       of arguments as inputs to `computation`.
     device_assignment: if not `None`, a `DeviceAssignment` describing the
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 87a970f0523363426b0da5b12838b797d7f8bebb..075ecbc52a642ebe9c35e0b9ede8bd4ee963aec6 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -31,11 +31,13 @@ import six
 from six.moves import queue as Queue  # pylint: disable=redefined-builtin
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.contrib.tpu.ops import gen_tpu_ordinal_selector_op
 from tensorflow.contrib.tpu.proto import compilation_result_pb2 as tpu_compilation_result
-from tensorflow.contrib.tpu.python.tpu import tensor_tracer
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import error_handling
+from tensorflow.contrib.tpu.python.tpu import functional as tpu_functional
 from tensorflow.contrib.tpu.python.tpu import session_support
+from tensorflow.contrib.tpu.python.tpu import tensor_tracer
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_config
 from tensorflow.contrib.tpu.python.tpu import tpu_context
@@ -55,6 +57,7 @@ from tensorflow.python.estimator.export import export_output as export_output_li
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
@@ -90,6 +93,7 @@ _ONE_GIGABYTE = 1024 * 1024 * 1024
 _TPU_ENQUEUE_OPS = '_tpu_enqueue_ops'
 _TPU_TRAIN_OP = '_tpu_train_op'
 _REWRITE_FOR_INFERENCE_MODE = '_rewrite_for_inference'
+_KEY_WHEN_PREDICTIONS_IS_A_TENSOR = '_key_when_predictions_is_a_tensor'
 
 # Ideally _USE_TPU_KEY should be reserved as well. However there are already
 # models that make use of this key, thus it can not be reserved now to prevent
@@ -1303,6 +1307,44 @@ class _InputPipeline(object):
         logging.warn(err_msg)
 
 
+def call_computation(computation,
+                     experimental_exported_model_uses_all_cores=True):
+  """Call computation.
+
+  computation uses a single-core for TPU inference. If
+  `experimental_exported_model_uses_all_cores` is `True`, this function will
+  round-robin
+  computation among all TPU cores visible to the host; otherwise, it will use
+  a single core.
+
+  Args:
+    computation: A Python function that takes no inputs and builds computation
+      graph. If `computation` returns m outputs, this function will return a
+      list of m Tensors.
+    experimental_exported_model_uses_all_cores: Whether to round-robin among all
+      cores visible to the host, or to use a single core.
+
+  Returns:
+    A list of output tensors.
+  """
+  if experimental_exported_model_uses_all_cores:
+    # Using `TPUPartitionedCall` makes it possible to target a different
+    # TPU core with every `Session.run()` call. Note that the entire inference
+    # graph executes on a single core, and that invocations of this graph
+    # will round-robin among the cores attached to a host.
+    @function.Defun()
+    def tpu_subgraph():
+      return computation()
+
+    return tpu_functional.TPUPartitionedCall(
+        args=tpu_subgraph.captured_inputs,
+        device_ordinal=gen_tpu_ordinal_selector_op.tpu_ordinal_selector(),
+        Tout=[o.type for o in tpu_subgraph.definition.signature.output_arg],
+        f=tpu_subgraph)
+  else:
+    return computation()
+
+
 class _ModelFnWrapper(object):
   """A `model_fn` wrapper.
 
@@ -1370,7 +1412,8 @@ class _ModelFnWrapper(object):
       if tensor_tracer.TensorTracer.is_enabled():
         tt = tensor_tracer.TensorTracer()
         loss, tracing_ops = tt.trace_tpu(ops.get_default_graph(), loss,
-                                         self._ctx.num_replicas)
+                                         self._ctx.num_replicas,
+                                         fetches=[loss, train_op])
 
       # We must run train_op to update the variables prior to running the
       # outfeed.
@@ -2100,7 +2143,8 @@ class TPUEstimator(estimator_lib.Estimator):
                batch_axis=None,
                eval_on_tpu=True,
                export_to_tpu=True,
-               warm_start_from=None):
+               warm_start_from=None,
+               experimental_exported_model_uses_all_cores=False):
     """Constructs an `TPUEstimator` instance.
 
     Args:
@@ -2143,12 +2187,20 @@ class TPUEstimator(estimator_lib.Estimator):
       eval_on_tpu: If False, evaluation runs on CPU or GPU. In this case, the
         model_fn must return `EstimatorSpec` when called with `mode` as `EVAL`.
       export_to_tpu: If True, `export_savedmodel()` exports a metagraph for
-        serving on TPU besides the one on CPU.
+        serving on TPU besides the one on CPU. Note that unsupported export
+        modes such as EVAL will be ignored. For those modes, only a CPU model
+        will be exported. Currently, export_to_tpu only supports PREDICT.
       warm_start_from: Optional string filepath to a checkpoint or SavedModel to
         warm-start from, or a `tf.estimator.WarmStartSettings` object to fully
         configure warm-starting.  If the string filepath is provided instead of
         a `WarmStartSettings`, then all variables are warm-started, and it is
         assumed that vocabularies and Tensor names are unchanged.
+      experimental_exported_model_uses_all_cores: Whether to round-robin among
+        all cores visible to the host which is serving the saved model, or to
+        use a single core. This is a temporary flag to enable using all TPU
+        cores for inference with TPUPartitionedCall(). Once outside compilation
+        is supported in TPUPartitionedCall(), this flag will be enabled by
+        default.
 
     Raises:
       ValueError: `params` has reserved keys already.
@@ -2213,6 +2265,8 @@ class TPUEstimator(estimator_lib.Estimator):
         use_tpu, eval_on_tpu)
 
     self._export_to_tpu = export_to_tpu
+    self._experimental_exported_model_uses_all_cores = (
+        experimental_exported_model_uses_all_cores)
 
     self._is_input_fn_invoked = None
     self._rendezvous = {}
@@ -2226,10 +2280,9 @@ class TPUEstimator(estimator_lib.Estimator):
                                export_tags=None,
                                check_variables=True):
     if self._export_to_tpu and mode != model_fn_lib.ModeKeys.PREDICT:
-      raise NotImplementedError(
-          'TPUEstimator only handles mode PREDICT for exporting '
-          'when `export_to_tpu` is `True`; '
-          'got {}.'.format(mode))
+      logging.warning('TPUEstimator only handles mode PREDICT for exporting '
+                      'when `export_to_tpu` is `True`; Mode {} will be ignored '
+                      'for TPU.'.format(mode))
 
     (super(TPUEstimator, self)._add_meta_graph_for_mode(
         builder,
@@ -2240,7 +2293,7 @@ class TPUEstimator(estimator_lib.Estimator):
         export_tags=export_tags,
         check_variables=check_variables))
 
-    if self._export_to_tpu:
+    if self._export_to_tpu and mode == model_fn_lib.ModeKeys.PREDICT:
       input_receiver_fn_map = {
           _REWRITE_FOR_INFERENCE_MODE: input_receiver_fn_map[mode]
       }
@@ -2269,6 +2322,79 @@ class TPUEstimator(estimator_lib.Estimator):
       raise ValueError('mode must be {}; '
                        'got {}.'.format(_REWRITE_FOR_INFERENCE_MODE, mode))
 
+    computation, capture = self._build_computation_for_inference(
+        features, labels, mode, config)
+    tensors = call_computation(
+        computation,
+        experimental_exported_model_uses_all_cores=self
+        ._experimental_exported_model_uses_all_cores)
+    estimator_spec, export_outputs_dict, predictions_dict, none_indices = (
+        capture.get())
+    predictions_list = tensors[:len(predictions_dict)]
+    export_outputs_list_without_none = tensors[len(predictions_dict):]
+
+    # Reinsert `None`s which we've taken out in
+    # `_build_computation_for_inference()`.
+    export_outputs_list = []
+    while none_indices or export_outputs_list_without_none:
+      if none_indices and none_indices[0] == len(export_outputs_list):
+        export_outputs_list.append(None)
+        none_indices.pop(0)
+      else:
+        export_outputs_list.append(export_outputs_list_without_none.pop(0))
+
+    # Reconstruct `export_outputs` with updated tensors.
+    new_export_outputs_dict = nest.pack_sequence_as(export_outputs_dict,
+                                                    export_outputs_list)
+    export_outputs = estimator_spec.export_outputs
+    new_export_outputs = collections.OrderedDict(
+        (k, _clone_export_output_with_tensors(export_outputs[k], v))
+        for k, v in six.iteritems(new_export_outputs_dict))
+    # Reconstruct `predictions` with updated tensors.
+    new_predictions = nest.pack_sequence_as(predictions_dict, predictions_list)
+    if (len(new_predictions) == 1 and
+        _KEY_WHEN_PREDICTIONS_IS_A_TENSOR in new_predictions):
+      new_predictions = new_predictions[_KEY_WHEN_PREDICTIONS_IS_A_TENSOR]
+
+    return estimator_spec._replace(
+        export_outputs=new_export_outputs, predictions=new_predictions)
+
+  def _build_computation_for_inference(self, features, labels, mode, config):
+    capture = _CapturedObject()
+
+    def computation():
+      """Computation to be passed to `TPUPartitionedCall()`."""
+      tpu_computation, tpu_capture = self._build_tpu_computation_for_inference(
+          features, labels, mode, config)
+
+      tensors_on_cpu = tpu.rewrite_for_inference(tpu_computation)
+      (estimator_spec, export_outputs_dict, export_outputs_list,
+       predictions_dict) = (
+           tpu_capture.get())
+      predictions_list = tensors_on_cpu[:len(predictions_dict)]
+      export_outputs_tpu_on_cpu_list = tensors_on_cpu[len(predictions_dict):]
+
+      # Reconstruct tensors used in export_outputs, with TPU tensors replaced
+      # with their CPU counterpart returned from `rewrite_for_inference()`.
+      # `function.Defun()` does not like `None`s in return values, so we leave
+      # `None`s out but record their positions for later reconstruction.
+      export_outputs_list_without_none = []
+      none_indices = []
+      for i, t in enumerate(export_outputs_list):
+        if t is None:
+          none_indices.append(i)
+        else:
+          export_outputs_list_without_none.append(
+              export_outputs_tpu_on_cpu_list.pop(0))
+
+      capture.capture((estimator_spec, export_outputs_dict, predictions_dict,
+                       none_indices))
+      return predictions_list + export_outputs_list_without_none
+
+    return computation, capture
+
+  def _build_tpu_computation_for_inference(self, features, labels, mode,
+                                           config):
     capture = _CapturedObject()
 
     def computation():
@@ -2289,38 +2415,30 @@ class TPUEstimator(estimator_lib.Estimator):
 
       # We pick the TPU tensors out from `export_output` and later return them
       # from `computation` for rewriting.
-      tensors_dict = collections.OrderedDict(
+      export_outputs_dict = collections.OrderedDict(
           (k, _export_output_to_tensors(v))
           for k, v in six.iteritems(estimator_spec.export_outputs))
-      tensors = nest.flatten(tensors_dict)
-      tpu_tensors = [t for t in tensors if t is not None]
-
-      # We cannot return anything other than `tpu_tensors` here so we capture
-      # the rest for later use.
-      capture.capture((estimator_spec, tensors_dict, tensors))
-      return tpu_tensors
-
-    tpu_tensors_on_cpu = tpu.rewrite_for_inference(computation)
-    estimator_spec, tensors_dict, tensors = capture.get()
-
-    # Reconstruct `tensors`, but with `tpu_tensors` replaced with
-    # `tpu_tensors_on_cpu`.
-    new_tensors = []
-    for t in tensors:
-      if t is None:
-        new_tensors.append(None)
+      export_outputs_list = nest.flatten(export_outputs_dict)
+      export_outputs_tpu_list = [
+          t for t in export_outputs_list if t is not None
+      ]
+
+      if isinstance(estimator_spec.predictions, dict):
+        predictions_dict = collections.OrderedDict(
+            (k, v) for k, v in six.iteritems(estimator_spec.predictions))
       else:
-        new_tensors.append(tpu_tensors_on_cpu.pop(0))
+        predictions_dict = {
+            _KEY_WHEN_PREDICTIONS_IS_A_TENSOR: estimator_spec.predictions
+        }
+      predictions_list = nest.flatten(predictions_dict)
 
-    # Reconstruct `tensors_dict`.
-    new_tensors_dict = nest.pack_sequence_as(tensors_dict, new_tensors)
-    # Reconstruct `export_outputs`.
-    export_outputs = estimator_spec.export_outputs
-    new_export_outputs = collections.OrderedDict(
-        (k, _clone_export_output_with_tensors(export_outputs[k], v))
-        for k, v in six.iteritems(new_tensors_dict))
+      # We cannot return everything we want through the return values, so
+      # capture the rest here for later use.
+      capture.capture((estimator_spec, export_outputs_dict, export_outputs_list,
+                       predictions_dict))
+      return predictions_list + export_outputs_tpu_list
 
-    return estimator_spec._replace(export_outputs=new_export_outputs)
+    return computation, capture
 
   def _create_global_step(self, graph):
     """Creates a global step suitable for TPUs.
@@ -2538,7 +2656,7 @@ class TPUEstimator(estimator_lib.Estimator):
         if self._log_every_n_steps is not None:
           examples_hook = ExamplesPerSecondHook(
               ctx.global_batch_size,
-              output_dir=self.model_dir,
+              output_dir=self.model_dir if config.save_summary_steps else None,
               every_n_steps=self._log_every_n_steps)
 
         if ctx.is_running_on_cpu(is_export_mode=is_export_mode):
diff --git a/tensorflow/contrib/training/python/training/hparam.py b/tensorflow/contrib/training/python/training/hparam.py
index bcc177601b95172b05d327247bd370c2f8b65d59..27f0d9b2e38c433d4fb4573285ecb8c9946112e8 100644
--- a/tensorflow/contrib/training/python/training/hparam.py
+++ b/tensorflow/contrib/training/python/training/hparam.py
@@ -499,6 +499,7 @@ class HParams(object):
       value: New value of the hyperparameter.
 
     Raises:
+      KeyError: If the hyperparameter doesn't exist.
       ValueError: If there is a type mismatch.
     """
     param_type, is_list = self._hparam_types[name]
@@ -517,6 +518,8 @@ class HParams(object):
   def del_hparam(self, name):
     """Removes the hyperparameter with key 'name'.
 
+    Does nothing if it isn't present.
+
     Args:
       name: Name of the hyperparameter.
     """
@@ -525,19 +528,20 @@ class HParams(object):
       del self._hparam_types[name]
 
   def parse(self, values):
-    """Override hyperparameter values, parsing new values from a string.
+    """Override existing hyperparameter values, parsing new values from a string.
 
     See parse_values for more detail on the allowed format for values.
 
     Args:
-      values: String.  Comma separated list of `name=value` pairs where
-        'value' must follow the syntax described above.
+      values: String.  Comma separated list of `name=value` pairs where 'value'
+        must follow the syntax described above.
 
     Returns:
       The `HParams` instance.
 
     Raises:
-      ValueError: If `values` cannot be parsed.
+      ValueError: If `values` cannot be parsed or a hyperparameter in `values`
+      doesn't exist.
     """
     type_map = dict()
     for name, t in self._hparam_types.items():
@@ -548,7 +552,7 @@ class HParams(object):
     return self.override_from_dict(values_map)
 
   def override_from_dict(self, values_dict):
-    """Override hyperparameter values, parsing new values from a dictionary.
+    """Override existing hyperparameter values, parsing new values from a dictionary.
 
     Args:
       values_dict: Dictionary of name:value pairs.
@@ -557,6 +561,7 @@ class HParams(object):
       The `HParams` instance.
 
     Raises:
+      KeyError: If a hyperparameter in `values_dict` doesn't exist.
       ValueError: If `values_dict` cannot be parsed.
     """
     for name, value in values_dict.items():
@@ -596,7 +601,7 @@ class HParams(object):
         sort_keys=sort_keys)
 
   def parse_json(self, values_json):
-    """Override hyperparameter values, parsing new values from a json object.
+    """Override existing hyperparameter values, parsing new values from a json object.
 
     Args:
       values_json: String containing a json object of name:value pairs.
@@ -605,6 +610,7 @@ class HParams(object):
       The `HParams` instance.
 
     Raises:
+      KeyError: If a hyperparameter in `values_json` doesn't exist.
       ValueError: If `values_json` cannot be parsed.
     """
     values_map = json.loads(values_json)
diff --git a/tensorflow/contrib/training/python/training/training.py b/tensorflow/contrib/training/python/training/training.py
index c272a2ac144068cfb7355c2647eebf5bd0ce9d50..fc6e38ab4a5243cb7502f4ca42db03cbfd342a40 100644
--- a/tensorflow/contrib/training/python/training/training.py
+++ b/tensorflow/contrib/training/python/training/training.py
@@ -419,7 +419,7 @@ def create_train_op(total_loss,
     update_ops = set(update_ops)
   if not global_update_ops.issubset(update_ops):
     logging.warning('update_ops in create_train_op does not contain all the '
-                    ' update_ops in GraphKeys.UPDATE_OPS')
+                    'update_ops in GraphKeys.UPDATE_OPS')
 
   # Make sure update_ops are computed before total_loss.
   if update_ops:
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 2abaadc657858c800db61d08af2d47e8324a546b..e6af9211b55097303e3fefba1ceb7b9e53a37d90 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1371,7 +1371,7 @@ cc_library(
 
 # This includes implementations of all kernels built into TensorFlow.
 cc_library(
-    name = "all_kernels_statically_linked",
+    name = "all_kernels_impl",
     visibility = ["//visibility:private"],
     deps = [
         "//tensorflow/core/kernels:array",
@@ -1387,7 +1387,6 @@ cc_library(
         "//tensorflow/core/kernels:ctc_ops",
         "//tensorflow/core/kernels:cudnn_rnn_kernels",
         "//tensorflow/core/kernels:data_flow",
-        "//tensorflow/core/kernels:dataset_ops",
         "//tensorflow/core/kernels:decode_proto_op",
         "//tensorflow/core/kernels:encode_proto_op",
         "//tensorflow/core/kernels:fake_quant_ops",
@@ -1398,7 +1397,6 @@ cc_library(
         "//tensorflow/core/kernels:image",
         "//tensorflow/core/kernels:io",
         "//tensorflow/core/kernels:linalg",
-        "//tensorflow/core/kernels:list_kernels",
         "//tensorflow/core/kernels:lookup",
         "//tensorflow/core/kernels:logging",
         "//tensorflow/core/kernels:manip",
@@ -1461,8 +1459,13 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = if_dynamic_kernels(
         [],
-        otherwise = [":all_kernels_statically_linked"],
-    ),
+        otherwise = [":all_kernels_impl"],
+    ) + [
+        # TODO(gunan): Work on the API between these and rest of TF and make
+        # these also dynamically loading.
+        "//tensorflow/core/kernels:dataset_ops",  # Depends on grappler
+        "//tensorflow/core/kernels:list_kernels",  # Depends on variant_op_registry.h
+    ],
 )
 
 tf_cuda_library(
@@ -1747,6 +1750,7 @@ cc_library(
 cc_library(
     name = "mobile_additional_lib_deps",
     deps = tf_additional_lib_deps() + [
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
     ],
@@ -2006,6 +2010,13 @@ tf_pyclif_proto_library(
     visibility = ["//visibility:public"],
 )
 
+tf_pyclif_proto_library(
+    name = "framework/step_stats_pyclif",
+    proto_lib = ":protos_all_cc",
+    proto_srcfile = "framework/step_stats.proto",
+    visibility = ["//visibility:public"],
+)
+
 tf_pyclif_proto_library(
     name = "framework/types_pyclif",
     proto_lib = ":protos_all_cc",
@@ -2183,6 +2194,7 @@ cc_library(
         ],
     }),
     deps = tf_additional_lib_deps() + [
+        "@com_google_absl//absl/meta:type_traits",
         "@com_google_absl//absl/strings",
         "//third_party/eigen3",
         "@com_google_absl//absl/base:core_headers",
@@ -2337,7 +2349,12 @@ cc_library(
 
 cc_library(
     name = "tflite_portable_logging",
-    srcs = [],
+    srcs = [
+    ] + if_ios([
+        "platform/default/logging.cc",
+        "platform/env_time.cc",
+        "platform/posix/env_time.cc",
+    ]),
     hdrs = [
         "lib/bfloat16/bfloat16.h",
         "platform/default/integral_types.h",
@@ -2346,7 +2363,7 @@ cc_library(
         "platform/macros.h",
         "platform/platform.h",
         "platform/types.h",
-    ] + if_windows(["platform/windows/integral_types.h"]),
+    ] + if_windows(["platform/windows/integral_types.h"]) + if_ios(["platform/env_time.h"]),
     copts = tf_copts(),
     linkopts = ["-ldl"],
     deps = [
@@ -2801,6 +2818,7 @@ tf_cuda_library(
         ":proto_text",
         ":protos_all_cc",
         "//third_party/eigen3",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -2815,12 +2833,16 @@ CORE_CPU_BASE_HDRS = GRAPH_HDRS + [
     "framework/versions.h",
     "common_runtime/process_function_library_runtime.h",
     "common_runtime/function.h",
+    "common_runtime/scoped_allocator.h",
+    "common_runtime/scoped_allocator_mgr.h",
 ]
 
 tf_cuda_library(
     name = "core_cpu_base",
     srcs = [
         "common_runtime/eval_const_tensor.cc",
+        "common_runtime/scoped_allocator.cc",
+        "common_runtime/scoped_allocator_mgr.cc",
         "common_runtime/shape_refiner.cc",
         "common_runtime/shape_refiner.h",
         "framework/versions.h",
@@ -2880,6 +2902,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/mkl_cpu_allocator.h",
     "common_runtime/optimization_registry.h",
     "common_runtime/pending_counts.h",
+    "common_runtime/partitioning_utils.h",
     "common_runtime/placer.h",
     "common_runtime/process_util.h",
     "common_runtime/profile_handler.h",
@@ -2887,8 +2910,6 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/rendezvous_mgr.h",
     "common_runtime/rendezvous_util.h",
     "common_runtime/ring_reducer.h",
-    "common_runtime/scoped_allocator.h",
-    "common_runtime/scoped_allocator_mgr.h",
     "common_runtime/session_factory.h",
     "common_runtime/single_threaded_cpu_device.h",
     "common_runtime/stats_publisher_interface.h",
@@ -2936,6 +2957,7 @@ tf_cuda_library(
         "common_runtime/mkl_cpu_allocator.cc",
         "common_runtime/optimization_registry.cc",
         "common_runtime/parallel_concat_optimizer.cc",
+        "common_runtime/partitioning_utils.cc",
         "common_runtime/placer.cc",
         "common_runtime/pool_allocator.cc",
         "common_runtime/process_function_library_runtime.cc",
@@ -2945,8 +2967,6 @@ tf_cuda_library(
         "common_runtime/rendezvous_mgr.cc",
         "common_runtime/rendezvous_util.cc",
         "common_runtime/ring_reducer.cc",
-        "common_runtime/scoped_allocator.cc",
-        "common_runtime/scoped_allocator_mgr.cc",
         "common_runtime/session.cc",
         "common_runtime/session_factory.cc",
         "common_runtime/session_options.cc",
@@ -2974,8 +2994,9 @@ tf_cuda_library(
         ":proto_text",
         ":protos_all_cc",
         "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
         "//third_party/eigen3",
-        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/utils:functions",
     ] + mkl_deps(),
     alwayslink = 1,
 )
@@ -3490,6 +3511,29 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "platform_fake_python_env_test",
+    size = "small",
+    srcs = ["platform/fake_python_env_test.cc"],
+    args = [
+        "/some/path/to/pythontest.runfiles/org_tensorflow/stuff/to/run.py",
+    ],
+    tags = [
+        "local",
+        "no_windows",
+        "nogpu",
+        "nomac",
+        "notap",
+    ],
+    deps = [
+        ":lib",
+        ":lib_internal",
+        ":lib_test_internal",
+        ":test",
+        ":test_main",
+    ],
+)
+
 tf_cc_test(
     name = "platform_abi_test",
     size = "small",
@@ -4198,7 +4242,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "common_runtime_process_function_library_runtime_test",
     size = "small",
     srcs = ["common_runtime/process_function_library_runtime_test.cc"],
@@ -4207,6 +4251,7 @@ tf_cc_test(
         ":core_cpu",
         ":core_cpu_internal",
         ":framework",
+        ":framework_internal",
         ":lib",
         ":test",
         ":test_main",
@@ -4215,6 +4260,7 @@ tf_cc_test(
         "//tensorflow/core/kernels:cast_op",
         "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:function_ops",
+        "//tensorflow/core/kernels:resource_variable_ops",
     ],
 )
 
@@ -4256,6 +4302,27 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "common_runtime_partitioning_utils_test",
+    size = "small",
+    srcs = ["common_runtime/partitioning_utils_test.cc"],
+    deps = [
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":framework",
+        ":lib",
+        ":ops",
+        ":test",
+        ":test_main",
+        ":testlib",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:function_ops",
+        "//tensorflow/core/kernels:function_ops",
+        "//tensorflow/core/kernels:identity_op",
+    ],
+)
+
 tf_cuda_cc_test(
     name = "common_runtime_direct_session_test",
     size = "small",
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt
index 070d6adb978e4a62e7209f299dba08515aa21e83..d0794de4ba4a174838547865e4f1692cff503052 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt
@@ -33,6 +33,15 @@ END
     name: "padding"
     description: <<END
 The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "explicit_paddings"
+    description: <<END
+If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+dimension, the amount of padding inserted before and after the dimension is
+`explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+`padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
 END
   }
   attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt
index ff2d9d71db646a27a88763f79bb6beb6b5ede44b..c8af9ff976688a0db78d26a495543cc3c052944a 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt
@@ -41,6 +41,15 @@ END
     name: "padding"
     description: <<END
 The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "explicit_paddings"
+    description: <<END
+If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+dimension, the amount of padding inserted before and after the dimension is
+`explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+`padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
 END
   }
   attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt
index 2de38b4263a380b5d0aec45270b9b67347c7021d..8aaae4aab6fd006931ce9f3ef1633a2c1e7c613b 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt
@@ -40,6 +40,15 @@ END
     name: "padding"
     description: <<END
 The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "explicit_paddings"
+    description: <<END
+If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+dimension, the amount of padding inserted before and after the dimension is
+`explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+`padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
 END
   }
   attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_RegexReplace.pbtxt b/tensorflow/core/api_def/base_api/api_def_RegexReplace.pbtxt
index 70ad5219267fcc84368f072a6f5a122b6cc11a89..2cc1a55676c354c9470287ccb89e39489ab18c02 100644
--- a/tensorflow/core/api_def/base_api/api_def_RegexReplace.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RegexReplace.pbtxt
@@ -10,7 +10,7 @@ op {
   }
   in_arg {
     name: "rewrite"
-    description: "The rewrite to be applied to the matched expresion."
+    description: "The rewrite to be applied to the matched expression."
   }
   out_arg {
     name: "output"
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
index d9c4d5a4a4008c439ece7fde52a2913f6a50956d..b0458207e6eb8b18a21e1f67b84e691fb5601e9a 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
@@ -28,10 +28,8 @@ be protected by a lock; otherwise the behavior is undefined,
 but may exhibit less contention.
 END
   }
-  summary: "Adds sparse `updates` to individual values or slices within a given"
+  summary: "Applies sparse addition to individual values or slices in a Variable."
   description: <<END
-variable according to `indices`.
-
 `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 
 `indices` must be integer tensor, containing indices into `ref`.
@@ -44,24 +42,24 @@ dimension of `ref`.
 `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
 ```
-[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
 ```
 
-For example, say we want to update 4 scattered elements to a rank-1 tensor to
-8 elements. In Python, that update would look like this:
+For example, say we want to add 4 scattered elements to a rank-1 tensor to
+8 elements. In Python, that addition would look like this:
 
 ```python
-    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
-    indices = tf.constant([[4], [3], [1] ,[7]])
-    updates = tf.constant([9, 10, 11, 12])
-    update = tf.scatter_nd_add(ref, indices, updates)
-    with tf.Session() as sess:
-      print sess.run(update)
+ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+indices = tf.constant([[4], [3], [1], [7]])
+updates = tf.constant([9, 10, 11, 12])
+add = tf.scatter_nd_add(ref, indices, updates)
+with tf.Session() as sess:
+  print sess.run(add)
 ```
 
 The resulting update to ref would look like this:
 
-    [1, 12, 3, 14, 14, 6, 7, 20]
+    [1, 13, 3, 14, 14, 6, 7, 20]
 
 See `tf.scatter_nd` for more details about how to make updates to
 slices.
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdSub.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f12f4b5f34767e54bdd9c4ede9cb2c495eda723f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdSub.pbtxt
@@ -0,0 +1,67 @@
+op {
+  graph_op_name: "ResourceScatterNdSub"
+  in_arg {
+    name: "ref"
+    description: <<END
+A resource handle. Must be from a VarHandleOp.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A Tensor. Must be one of the following types: int32, int64.
+A tensor of indices into ref.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A Tensor. Must have the same type as ref. A tensor of
+values to add to ref.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+An optional bool. Defaults to True. If True, the assignment will
+be protected by a lock; otherwise the behavior is undefined,
+but may exhibit less contention.
+END
+  }
+  summary: "Applies sparse subtraction to individual values or slices in a Variable."
+  description: <<END
+`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+`indices` must be integer tensor, containing indices into `ref`.
+It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+The innermost dimension of `indices` (with length `K`) corresponds to
+indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+dimension of `ref`.
+
+`updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+```
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
+```
+
+For example, say we want to subtract 4 scattered elements from a rank-1 tensor
+with 8 elements. In Python, that subtraction would look like this:
+
+```python
+ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+indices = tf.constant([[4], [3], [1], [7]])
+updates = tf.constant([9, 10, 11, 12])
+sub = tf.scatter_nd_sub(ref, indices, updates)
+with tf.Session() as sess:
+  print sess.run(sub)
+```
+
+The resulting update to ref would look like this:
+
+    [1, -9, 3, -6, -4, 6, 7, -4]
+
+See `tf.scatter_nd` for more details about how to make updates to
+slices.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNdAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNdAdd.pbtxt
index 5929425bc80f218627a7977a7b4e869715f7963b..b8fbcbbed29de68088db9ee12ae86cde5c7d6aa8 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNdAdd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNdAdd.pbtxt
@@ -35,14 +35,12 @@ be protected by a lock; otherwise the behavior is undefined,
 but may exhibit less contention.
 END
   }
-  summary: "Applies sparse addition between `updates` and individual values or slices"
+  summary: "Applies sparse addition to individual values or slices in a Variable."
   description: <<END
-within a given variable according to `indices`.
-
 `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 
 `indices` must be integer tensor, containing indices into `ref`.
-It must be shape `\\([d_0, ..., d_{Q-2}, K]\\)` where `0 < K <= P`.
+It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 
 The innermost dimension of `indices` (with length `K`) corresponds to
 indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
@@ -50,17 +48,21 @@ dimension of `ref`.
 
 `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
-$$[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].$$
+```
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
+```
 
-For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
-elements. In Python, that addition would look like this:
+For example, say we want to add 4 scattered elements to a rank-1 tensor to
+8 elements. In Python, that addition would look like this:
 
-    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-    indices = tf.constant([[4], [3], [1], [7]])
-    updates = tf.constant([9, 10, 11, 12])
-    add = tf.scatter_nd_add(ref, indices, updates)
-    with tf.Session() as sess:
-      print sess.run(add)
+```python
+ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+indices = tf.constant([[4], [3], [1], [7]])
+updates = tf.constant([9, 10, 11, 12])
+add = tf.scatter_nd_add(ref, indices, updates)
+with tf.Session() as sess:
+  print sess.run(add)
+```
 
 The resulting update to ref would look like this:
 
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNdSub.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNdSub.pbtxt
index 67346f051e75b68bc98b0e9026849f1c0f512939..b557addb7ce872edb76199a071907c59c8454abb 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNdSub.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNdSub.pbtxt
@@ -35,14 +35,14 @@ be protected by a lock; otherwise the behavior is undefined,
 but may exhibit less contention.
 END
   }
-  summary: "Applies sparse subtraction between `updates` and individual values or slices"
+  summary: "Applies sparse subtraction to individual values or slices in a Variable."
   description: <<END
 within a given variable according to `indices`.
 
 `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 
 `indices` must be integer tensor, containing indices into `ref`.
-It must be shape \\([d_0, ..., d_{Q-2}, K]\\) where `0 < K <= P`.
+It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 
 The innermost dimension of `indices` (with length `K`) corresponds to
 indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
@@ -50,17 +50,21 @@ dimension of `ref`.
 
 `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
-$$[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].$$
+```
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
+```
 
 For example, say we want to subtract 4 scattered elements from a rank-1 tensor
 with 8 elements. In Python, that subtraction would look like this:
 
-    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-    indices = tf.constant([[4], [3], [1], [7]])
-    updates = tf.constant([9, 10, 11, 12])
-    sub = tf.scatter_nd_sub(ref, indices, updates)
-    with tf.Session() as sess:
-      print sess.run(sub)
+```python
+ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+indices = tf.constant([[4], [3], [1], [7]])
+updates = tf.constant([9, 10, 11, 12])
+sub = tf.scatter_nd_sub(ref, indices, updates)
+with tf.Session() as sess:
+  print sess.run(sub)
+```
 
 The resulting update to ref would look like this:
 
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListResize.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListResize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5b34f8cec7e1c62142d280ad43e11c14afef30e5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListResize.pbtxt
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "TensorListResize"
+  summary: "Resizes the list."
+  description: <<END
+
+input_handle: the input list
+size: size of the output list
+
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
index 7a60e4387ad0078d51eba026fcd2d9454a50e4ec..ed4a2bd5588eecb19d9d5effb386b2fe5c0c4409 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
@@ -3,7 +3,8 @@ op {
   in_arg {
     name: "segment_ids"
     description: <<END
-A tensor whose shape is a prefix of `data.shape`.END
+A tensor whose shape is a prefix of `data.shape`.
+END
   }
   out_arg {
     name: "output"
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceScatterNdSub.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceScatterNdSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ff1d01db6bf5279c99c9305c1eec97ed8b6e84f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceScatterNdSub.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceScatterNdSub"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListResize.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListResize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d689d4f2b16a9e18064fe9c8be09650a3e4a641
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListResize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListResize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/common_runtime/data/BUILD b/tensorflow/core/common_runtime/data/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..124862dbb73422e7645fe460576ac35c83f018aa
--- /dev/null
+++ b/tensorflow/core/common_runtime/data/BUILD
@@ -0,0 +1,35 @@
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+        "//tensorflow_models:__subpackages__",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_protos_all")
+
+cc_library(
+    name = "standalone",
+    srcs = ["standalone.cc"],
+    hdrs = ["standalone.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:session_options",
+    ],
+)
+
+tf_cc_test(
+    name = "standalone_test",
+    srcs = ["standalone_test.cc"],
+    deps = [
+        ":standalone",
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ] + tf_protos_all(),
+)
diff --git a/tensorflow/core/common_runtime/data/standalone.cc b/tensorflow/core/common_runtime/data/standalone.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b05bff566f538970fa857a8a38888cd074a06c2f
--- /dev/null
+++ b/tensorflow/core/common_runtime/data/standalone.cc
@@ -0,0 +1,128 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/data/standalone.h"
+
+#include <memory>
+
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/graph_runner.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace standalone {
+
+Status Iterator::GetNext(std::vector<Tensor>* outputs, bool* end_of_input) {
+  return iterator_->GetNext(ctx_.get(), outputs, end_of_input);
+}
+
+Iterator::Iterator(IteratorBase* iterator, IteratorContext* ctx)
+    : iterator_(iterator), ctx_(ctx) {}
+
+Status Dataset::FromGraph(Params params, const GraphDef& graph_def,
+                          const string& fetch_node,
+                          std::unique_ptr<Dataset>* result) {
+  Graph graph(OpRegistry::Global());
+  TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, nullptr));
+
+  // Instantiate enough of the TensorFlow runtime to run `graph` on a single CPU
+  // device.
+  std::unique_ptr<DeviceMgr> device_mgr = MakeUnique<DeviceMgr>(
+      DeviceFactory::NewDevice("CPU", params.session_options, ""));
+  Device* device = device_mgr->ListDevices()[0];
+  // Clone the `FunctionLibraryDefinition` to extend its lifetime extends beyond
+  // the lifetime of `graph`.
+  std::unique_ptr<FunctionLibraryDefinition> flib_def =
+      MakeUnique<FunctionLibraryDefinition>(graph.flib_def());
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr =
+      MakeUnique<ProcessFunctionLibraryRuntime>(
+          device_mgr.get(), Env::Default(), TF_GRAPH_DEF_VERSION,
+          flib_def.get(), OptimizerOptions{}, nullptr /* parent */);
+
+  // Run graph up to `output_node` and extract the `DatasetBase` stored in the
+  // DT_VARIANT output tensor.
+  data::DatasetBase* dataset;
+  {
+    std::vector<Tensor> outputs;
+    GraphRunner graph_runner(device);
+    TF_RETURN_IF_ERROR(graph_runner.Run(&graph, pflr->GetFLR("/device:CPU:0"),
+                                        {}, {fetch_node}, &outputs));
+    TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(outputs[0], &dataset));
+    // NOTE(mrry): The dataset is currently owned by `outputs[0]`, so acquire an
+    // additional reference.
+    dataset->Ref();
+  }
+
+  std::unique_ptr<thread::ThreadPool> pool(
+      NewThreadPoolFromSessionOptions(params.session_options));
+  *result =
+      WrapUnique(new Dataset(dataset, device_mgr.release(), pflr.release(),
+                             flib_def.release(), pool.release()));
+  return Status::OK();
+}  // static
+
+Status Dataset::MakeIterator(std::unique_ptr<Iterator>* result) {
+  // Create an `IteratorContext`, which bundles together the necessary runtime
+  // support to create and get elements from an iterator.
+  std::unique_ptr<IteratorContext> ctx;
+  {
+    // NOTE(mrry): In the current API, an `IteratorContext` is always initially
+    // created from an `OpKernelContext*`, so we need to create a fake
+    // `OpKernelContext` with the appropriate subset of parameters.
+    OpKernelContext::Params op_params;
+    op_params.function_library = pflr_->GetFLR("/device:CPU:0");
+    op_params.device = device_mgr_->ListDevices()[0];
+    op_params.runner = &runner_;
+    OpKernelContext op_ctx(&op_params, 0);
+    IteratorContext::Params params(&op_ctx);
+    params.function_handle_cache = function_handle_cache_.get();
+    ctx = MakeUnique<IteratorContext>(std::move(params));
+  }
+
+  // Create the iterator from the dataset.
+  std::unique_ptr<IteratorBase> iterator;
+  TF_RETURN_IF_ERROR(dataset_->MakeIterator(ctx.get(), "iterator", &iterator));
+
+  *result = WrapUnique(new Iterator(iterator.release(), ctx.release()));
+
+  return Status::OK();
+}
+
+Dataset::Dataset(DatasetBase* dataset, DeviceMgr* device_mgr,
+                 ProcessFunctionLibraryRuntime* pflr,
+                 FunctionLibraryDefinition* flib_def, thread::ThreadPool* pool)
+    : dataset_(dataset),
+      device_mgr_(device_mgr),
+      flib_def_(flib_def),
+      pflr_(pflr),
+      pool_(pool) {
+  runner_ = [this](std::function<void()> c) { pool_->Schedule(std::move(c)); };
+  function_handle_cache_ =
+      MakeUnique<FunctionHandleCache>(pflr_->GetFLR("/device:CPU:0"));
+}
+
+Dataset::~Dataset() { dataset_->Unref(); }
+
+}  // namespace standalone
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/data/standalone.h b/tensorflow/core/common_runtime/data/standalone.h
new file mode 100644
index 0000000000000000000000000000000000000000..ecea5ba21d0e807b72808c31336916b5f12cb854
--- /dev/null
+++ b/tensorflow/core/common_runtime/data/standalone.h
@@ -0,0 +1,122 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DATA_STANDALONE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_DATA_STANDALONE_H_
+
+#include <memory>
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/function_handle_cache.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace data {
+namespace standalone {
+
+// The purpose of the API in this file is to facilitate standalone execution of
+// a tf.data input pipeline graph.
+//
+// The API exposes two abstractions -- a `Dataset` and an `Iterator` -- which
+// encapsulate TensorFlow runtime.
+//
+// The `Dataset` abstraction represents an input pipeline as a collection
+// of data sources and a logical plan of transformations that operate over the
+// data.
+//
+// The `Iterator` abstraction represents an execution of an input pipeline that
+// can be used to enumerate its elements.
+//
+// Example usage:
+//
+//   // Create a `Dataset` by running the `graph_def` graph and fetching the
+//   // output of the `fetch_node` node.
+//   tensorflow::data:standalone::Dataset::Params params;
+//   std::unique_ptr<tensorflow::data::standalone::Dataset> dataset;
+//   Status s = tensorflow::data::standalone::Dataset::FromGraph(
+//      params, graph_def, fetch_node, &dataset);
+//   if (!s.ok()) { /* error handling */ }
+//
+//   std::unique_ptr<tensorflow::data::standalone::Iterator> iterator;
+//   s = dataset->MakeIterator(&iterator);
+//   if (!s.ok()) { /* error handling */ }
+//
+//   bool end_of_input = false;
+//   while (!end_of_input) {
+//     std::vector<tensorflow::Tensor> outputs;
+//     s = iterator->GetNext(&outputs, &end_of_input);
+//     if (!s.ok()) { /* error handling */ }
+//     if (!end_of_input) { /* output handling */ }
+//   }
+
+class Dataset;
+
+// Represents an execution of an input pipeline that can be used to enumerate
+// its elements.
+class Iterator {
+ public:
+  // Returns the next element of the input pipeline (if there is one) and an
+  // indication of whether the end of the input pipeline has been reached.
+  Status GetNext(std::vector<Tensor>* outputs, bool* end_of_input);
+
+ private:
+  friend class Dataset;
+
+  Iterator(IteratorBase* iterator, IteratorContext* ctx);
+
+  std::unique_ptr<IteratorBase> iterator_;
+  std::unique_ptr<IteratorContext> ctx_;
+};
+
+// Represents an input pipeline as a collection of data sources and a logical
+// plan of transformations that operate over the data.
+class Dataset {
+ public:
+  // Parameters for `Dataset` creation (e.g. TensorFlow runtime configuration).
+  struct Params {
+    SessionOptions session_options;
+  };
+
+  // Creates a new `Dataset` instance by running the TensorFlow graph `graph`
+  // and fetching the output of the `fetch_node` node.
+  static Status FromGraph(Params params, const GraphDef& graph_def,
+                          const string& fetch_node,
+                          std::unique_ptr<Dataset>* result);
+
+  ~Dataset();
+
+  // Creates an iterator for this dataset.
+  Status MakeIterator(std::unique_ptr<Iterator>* result);
+
+ private:
+  Dataset(DatasetBase* dataset, DeviceMgr* device_mgr,
+          ProcessFunctionLibraryRuntime* pflr,
+          FunctionLibraryDefinition* flib_def, thread::ThreadPool* pool);
+
+  DatasetBase* dataset_;  // owned
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<FunctionLibraryDefinition> flib_def_;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+  std::unique_ptr<thread::ThreadPool> pool_;
+  std::unique_ptr<FunctionHandleCache> function_handle_cache_;
+  std::function<void(std::function<void()>)> runner_;
+};
+
+}  // namespace standalone
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_DATA_STANDALONE_H_
diff --git a/tensorflow/core/common_runtime/data/standalone_test.cc b/tensorflow/core/common_runtime/data/standalone_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7e7a7a9b6195c247d94ed137f4bce18cee9851b4
--- /dev/null
+++ b/tensorflow/core/common_runtime/data/standalone_test.cc
@@ -0,0 +1,188 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/data/standalone.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace data {
+namespace standalone {
+namespace {
+
+constexpr const char* const kGraphProto = R"proto(
+  node {
+    name: "Const/_0"
+    op: "Const"
+    attr {
+      key: "dtype"
+      value { type: DT_INT64 }
+    }
+    attr {
+      key: "value"
+      value {
+        tensor {
+          dtype: DT_INT64
+          tensor_shape {}
+          int64_val: 0
+        }
+      }
+    }
+  }
+  node {
+    name: "Const/_1"
+    op: "Const"
+    attr {
+      key: "dtype"
+      value { type: DT_INT64 }
+    }
+    attr {
+      key: "value"
+      value {
+        tensor {
+          dtype: DT_INT64
+          tensor_shape {}
+          int64_val: 10
+        }
+      }
+    }
+  }
+  node {
+    name: "Const/_2"
+    op: "Const"
+    attr {
+      key: "dtype"
+      value { type: DT_INT64 }
+    }
+    attr {
+      key: "value"
+      value {
+        tensor {
+          dtype: DT_INT64
+          tensor_shape {}
+          int64_val: 1
+        }
+      }
+    }
+  }
+  node {
+    name: "RangeDataset/_3"
+    op: "RangeDataset"
+    input: "Const/_0"
+    input: "Const/_1"
+    input: "Const/_2"
+    attr {
+      key: "output_shapes"
+      value { list { shape { unknown_rank: true } } }
+    }
+    attr {
+      key: "output_types"
+      value { list { type: DT_INT64 } }
+    }
+  }
+  node {
+    name: "MapDataset/_4"
+    op: "MapDataset"
+    input: "RangeDataset/_3"
+    attr {
+      key: "Targuments"
+      value { list {} }
+    }
+    attr {
+      key: "f"
+      value { func { name: "Dataset_map_<lambda>_10" } }
+    }
+    attr {
+      key: "output_shapes"
+      value { list { shape {} } }
+    }
+    attr {
+      key: "output_types"
+      value { list { type: DT_INT64 } }
+    }
+    attr {
+      key: "preserve_cardinality"
+      value { b: false }
+    }
+    attr {
+      key: "use_inter_op_parallelism"
+      value { b: true }
+    }
+  }
+  library {
+    function {
+      signature {
+        name: "Dataset_map_<lambda>_10"
+        input_arg { name: "arg0" type: DT_INT64 }
+        output_arg { name: "mul" type: DT_INT64 }
+        description: "Wrapper for passing nested structures to and from tf.data functions."
+      }
+      node_def {
+        name: "mul_0"
+        op: "Mul"
+        input: "arg0"
+        input: "arg0"
+        attr {
+          key: "T"
+          value { type: DT_INT64 }
+        }
+      }
+      ret { key: "mul" value: "mul_0:z:0" }
+    }
+  }
+  versions { producer: 27 min_consumer: 12 }
+)proto";
+
+TEST(Scalar, Standalone) {
+  GraphDef graph_def;
+  protobuf::TextFormat::ParseFromString(kGraphProto, &graph_def);
+  struct TestCase {
+    string fetch_node;
+    std::vector<int64> expected_outputs;
+  };
+  auto test_cases = {
+      TestCase{"RangeDataset/_3", {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}},
+      TestCase{"MapDataset/_4", {0, 1, 4, 9, 16, 25, 36, 49, 64, 81}},
+  };
+  for (auto test_case : test_cases) {
+    std::unique_ptr<Dataset> dataset;
+    auto s = Dataset::FromGraph({}, graph_def, test_case.fetch_node, &dataset);
+    TF_EXPECT_OK(s);
+    std::unique_ptr<Iterator> iterator;
+    s = dataset->MakeIterator(&iterator);
+    TF_EXPECT_OK(s);
+    bool end_of_input = false;
+    for (int num_outputs = 0; !end_of_input; ++num_outputs) {
+      std::vector<tensorflow::Tensor> outputs;
+      s = iterator->GetNext(&outputs, &end_of_input);
+      TF_EXPECT_OK(s);
+      if (!end_of_input) {
+        EXPECT_EQ(outputs[0].scalar<int64>()(),
+                  test_case.expected_outputs[num_outputs]);
+      } else {
+        EXPECT_EQ(test_case.expected_outputs.size(), num_outputs);
+      }
+    }
+  }
+}
+
+}  // namespace
+}  // namespace standalone
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 51b2c68c769ec4d1b675131de93eca09f0648862..36f1a92aab18ae327feab9a216141dccd3e2318e 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -59,6 +59,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/monitoring/counter.h"
+#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -303,10 +304,8 @@ DirectSession::DirectSession(const SessionOptions& options,
   if (!status.ok()) {
     LOG(ERROR) << status.error_message();
   }
-  // NOTE(mrry): We do not need to use a unique string for the session
-  // handle, because DirectSession owns its devices. This may change
-  // in future versions.
-  session_handle_ = "direct";
+  session_handle_ =
+      strings::StrCat("direct", strings::FpToString(random::New64()));
   int devices_added = 0;
   if (options.config.log_device_placement()) {
     const string mapping_str = device_mgr_->DeviceMappingString();
@@ -371,6 +370,7 @@ Status DirectSession::MaybeInitializeExecutionState(
   GraphExecutionStateOptions options;
   options.device_set = &device_set_;
   options.session_options = &options_;
+  options.session_handle = session_handle_;
   // TODO(mrry,suharshs): We explicitly copy `graph` so that
   // `MakeForBaseGraph()` can take ownership of its
   // contents. Previously this happened implicitly in calls to the
@@ -533,6 +533,7 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
   CancellationManager step_cancellation_manager;
   args.cancellation_manager = &step_cancellation_manager;
   args.session_state = &session_state_;
+  args.session_handle = session_handle_;
   args.tensor_store = &run_state.tensor_store;
   args.step_container = &run_state.step_container;
   args.sync_on_finish = sync_on_finish_;
@@ -888,6 +889,7 @@ Status DirectSession::PRunSetup(const std::vector<string>& input_names,
     SchedClosure(pool, std::move(c));
   };
   args.session_state = &session_state_;
+  args.session_handle = session_handle_;
   args.tensor_store = &run_state->tensor_store;
   args.step_container = &run_state->step_container;
   if (LogMemory::IsEnabled()) {
@@ -1465,6 +1467,7 @@ Status DirectSession::CreateGraphs(
     prune_options.device_set = &device_set_;
     prune_options.session_options = &options_;
     prune_options.stateful_placements = stateful_placements_;
+    prune_options.session_handle = session_handle_;
     TF_RETURN_IF_ERROR(GraphExecutionState::MakeForPrunedGraph(
         execution_state_->original_graph_def().library(), prune_options,
         execution_state_->original_graph_def(), subgraph_options,
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 6754e9cfb71700090049107cf4dd122175527ffe..bcac34154407eb461a80fd3d638ee51a88f3d7fa 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -317,6 +317,7 @@ class DirectSession : public Session {
   std::vector<Device*> devices_;  // not owned
   DeviceSet device_set_;
 
+  // Unique session identifier.
   string session_handle_;
   mutex graph_state_lock_;
   bool graph_created_ GUARDED_BY(graph_state_lock_) = false;
diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD
index 77e3246df045785df5908c263edbf668762acc38..cabbddb77d391fe19dcc0ff4cc7c54f94ea41dae 100644
--- a/tensorflow/core/common_runtime/eager/BUILD
+++ b/tensorflow/core/common_runtime/eager/BUILD
@@ -88,6 +88,34 @@ tf_cuda_library(
     ],
 )
 
+tf_cuda_library(
+    name = "profiler",
+    srcs = [
+        "profiler.cc",
+    ],
+    hdrs = [
+        "profiler.h",
+    ],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":context",
+        "//tensorflow/cc/profiler",
+    ] + select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib_lite",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:core_cpu_lib",
+            "//tensorflow/core:framework",
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+            "//tensorflow/core:lib_internal",
+            "//tensorflow/core:protos_all_cc",
+            "//tensorflow/core:session_options",
+        ],
+    }),
+)
+
 tf_cuda_library(
     name = "tensor_handle",
     srcs = [
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.cc b/tensorflow/core/common_runtime/eager/attr_builder.cc
index a750f8cbba4de4abd33d6ec395b6b0a5fb76cc67..689b04274fe76046700ec0dbbb5cc55b2931409f 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.cc
+++ b/tensorflow/core/common_runtime/eager/attr_builder.cc
@@ -125,6 +125,7 @@ Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out,
   template <>                                                                \
   AttrBuilder& AttrBuilder::Set(StringPiece attr_name, value_type&& value) { \
     value_field.push_back(std::make_pair(string(attr_name), value));         \
+    cached_cache_key_ = absl::nullopt;                                       \
     return *this;                                                            \
   }
 
@@ -231,7 +232,17 @@ inline tensorflow::Fprint128 CacheKeyHelper(StringPiece s, uint64 b) {
 
 }  // namespace
 
-tensorflow::Fprint128 AttrBuilder::CacheKey(const string& device) const {
+tensorflow::Fprint128 AttrBuilder::CacheKey(const string& device) {
+  if (!cached_cache_key_ || device != device_for_cached_cache_key_) {
+    cached_cache_key_ = BuildCacheKeyForDevice(device);
+    device_for_cached_cache_key_ = device;
+  }
+
+  return *cached_cache_key_;
+}
+
+tensorflow::Fprint128 AttrBuilder::BuildCacheKeyForDevice(
+    const string& device) const {
   tensorflow::Fprint128 f = tensorflow::Fingerprint128(op_name_);
   f = tensorflow::FingerprintCat128(f, tensorflow::Fingerprint128(device));
   if (node_def_ != nullptr) {
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.h b/tensorflow/core/common_runtime/eager/attr_builder.h
index 5e0172dfd328dbd4f16abdce879be1d1338e692c..aa64b5f59bd0cb54b1872c0328c10ebb1de622b6 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.h
+++ b/tensorflow/core/common_runtime/eager/attr_builder.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/fingerprint.h"
 #include "tensorflow/core/util/tensor_slice_reader_cache.h"
 
@@ -74,7 +75,7 @@ Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
 // AttrBuilder a;
 // a.NumInputs(2);
 // a.Set("T", TF_FLOAT);
-// uint64 cache_key = a.CacheKey("cpu:0");
+// tensorflow::Fprint128 cache_key = a.CacheKey("cpu:0");
 // const NodeDef& n = a.BuildNodeDef();
 //
 // Note that all calls to Set and NumInputs should happen before calling
@@ -100,10 +101,11 @@ class AttrBuilder {
   AttrBuilder& Set(StringPiece attr_name, T&& value) {
     MayBeInitializeNodeDef();
     SetInAttrValueMap(node_def_->mutable_attr(), string(attr_name), value);
+    cached_cache_key_ = absl::nullopt;
     return *this;
   }
 
-  tensorflow::Fprint128 CacheKey(const string& device) const;
+  tensorflow::Fprint128 CacheKey(const string& device);
 
   void FillAttrValueMap(AttrValueMap* m) const { FillAttrValueMap(m, true); }
   const NodeDef& BuildNodeDef();
@@ -112,6 +114,8 @@ class AttrBuilder {
   template <class T>
   using AttrVec = tensorflow::gtl::InlinedVector<std::pair<string, T>, 2>;
 
+  tensorflow::Fprint128 BuildCacheKeyForDevice(const string& device) const;
+
   void MayBeInitializeNodeDef();
   // Fill `m` with the attr-value pairs set via AttrBuilder::Set() so far, as
   // well as any default attr-value pairs from the associated op_def, if there
@@ -148,6 +152,9 @@ class AttrBuilder {
   int num_inputs_;
   std::unique_ptr<NodeDef> node_def_;
   bool node_def_finalized_;
+
+  absl::optional<tensorflow::Fprint128> cached_cache_key_;
+  string device_for_cached_cache_key_;
 };  // namespace tensorflow
 
 template <>
diff --git a/tensorflow/core/common_runtime/eager/attr_builder_test.cc b/tensorflow/core/common_runtime/eager/attr_builder_test.cc
index 220cc6f5ce0bff32cfdc8d4e837c6900c773728e..8245660bfcacc29b3345b4fbeec56da0acb8061b 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder_test.cc
+++ b/tensorflow/core/common_runtime/eager/attr_builder_test.cc
@@ -67,5 +67,18 @@ TEST(AttrTypeMap, Lookup) {
   EXPECT_NE(is_list, 0);
 }
 
+TEST(AttrTypeMap, CacheKey) {
+  AttrBuilder a("op_name");
+  a.NumInputs(2);
+  a.Set("T", TF_FLOAT);
+  tensorflow::Fprint128 cache_key = a.CacheKey("cpu:0");
+
+  ASSERT_FALSE(cache_key == a.CacheKey("cpu:1"));
+  ASSERT_TRUE(cache_key == a.CacheKey("cpu:0"));
+
+  a.Set("x", 1.0);
+  ASSERT_FALSE(cache_key == a.CacheKey("cpu:0"));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index 2212bda53449c1944a75318725eec0faf46438f1..1d93d2bbe6f95250c9e49f20fb24b42933b43422 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -234,6 +234,29 @@ Status EagerContext::FindDeviceByName(const string& name, Device** result) {
   return Status::OK();
 }
 
+void EagerContext::ClearRunMetadata() {
+  if (metadata_listener_ != nullptr) {
+    metadata_listener_->BeforeClearRunMetadata();
+  }
+  run_metadata_.Clear();
+}
+
+Status EagerContext::RegisterRunMetadataListener(
+    RunMetadataListener* listener) {
+  mutex_lock l(metadata_mu_);
+  if (metadata_listener_ != nullptr) {
+    return Status(error::Code::INVALID_ARGUMENT,
+                  "Cannot run two eager profiler at the same time");
+  }
+  metadata_listener_ = listener;
+  return Status::OK();
+}
+
+void EagerContext::ClearRunMetadataListener() {
+  mutex_lock l(metadata_mu_);
+  metadata_listener_ = nullptr;
+}
+
 void EagerContext::StartStep() {
   mutex_lock ml(metadata_mu_);
   num_active_steps_++;
@@ -317,10 +340,15 @@ void EagerContext::AddKernelToCache(Fprint128 cache_key,
   gtl::InsertOrUpdate(&kernel_cache_, cache_key, kernel);
 }
 
+bool EagerContext::ShouldStoreMetadata() {
+  mutex_lock ml(metadata_mu_);
+  return should_store_metadata_.load() || metadata_listener_ != nullptr;
+}
+
 void EagerContext::SetShouldStoreMetadata(bool value) {
+  mutex_lock ml(metadata_mu_);
   should_store_metadata_.store(value);
-  if (!value) {
-    mutex_lock ml(metadata_mu_);
+  if (!value || metadata_listener_ != nullptr) {
     run_metadata_.Clear();
   }
 }
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
index 5ff6b3ffbdd9ed7a6aa2e56e1ddb8648f9265ef0..b83f5707dae15a1958b68d15fc2b104f9981695d 100644
--- a/tensorflow/core/common_runtime/eager/context.h
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/example/example.pb.h"
 #ifndef __ANDROID__
 #include "tensorflow/core/distributed_runtime/eager/eager_client.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
@@ -66,6 +67,12 @@ enum ContextDevicePlacementPolicy {
   DEVICE_PLACEMENT_SILENT_FOR_INT32 = 3,
 };
 
+class RunMetadataListener {
+ public:
+  virtual ~RunMetadataListener() {}
+  virtual void BeforeClearRunMetadata() = 0;
+};
+
 class EagerContext {
  public:
   // TODO: remove this constructor once we migrate all callers to the next one.
@@ -172,10 +179,15 @@ class EagerContext {
   void ReleaseDeviceMgr() { local_device_manager_.release(); }
 
   // TODO(apassos) clean up RunMetadata storage.
-  mutex* MetadataMu() { return &metadata_mu_; }
-  bool ShouldStoreMetadata() { return should_store_metadata_.load(); }
+  mutex* MetadataMu() LOCK_RETURNED(metadata_mu_) { return &metadata_mu_; }
+  bool ShouldStoreMetadata() LOCKS_EXCLUDED(metadata_mu_);
   void SetShouldStoreMetadata(bool value);
   RunMetadata* RunMetadataProto() { return &run_metadata_; }
+  void ClearRunMetadata() EXCLUSIVE_LOCKS_REQUIRED(metadata_mu_);
+
+  Status RegisterRunMetadataListener(RunMetadataListener* listener)
+      LOCKS_EXCLUDED(metadata_mu_);
+  void ClearRunMetadataListener() LOCKS_EXCLUDED(metadata_mu_);
 
   void StartStep();
   void EndStep();
@@ -269,6 +281,7 @@ class EagerContext {
   std::atomic<bool> should_store_metadata_{false};
   mutex metadata_mu_;
   RunMetadata run_metadata_ GUARDED_BY(metadata_mu_);
+  RunMetadataListener* metadata_listener_ GUARDED_BY(metadata_mu_) = nullptr;
   GraphCollector graph_collector_;
   const bool log_device_placement_;
   // EagerExecutor for async execution.
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index 79806c3c732b684c6cca44480264c37e676c0da2..9d2401929d70989c66b94fca6460ac586e58f8fe 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -752,8 +752,8 @@ Status EagerExecute(EagerContext* ctx, Device* device,
     maybe_stats->set_all_end_rel_micros(nanos / EnvTime::kMicrosToNanos -
                                         maybe_stats->all_start_micros());
     maybe_stats->set_all_end_rel_nanos(nanos - maybe_stats->all_start_nanos());
-    mutex_lock ml(*ctx->MetadataMu());
     if (ctx->ShouldStoreMetadata()) {
+      mutex_lock ml(*ctx->MetadataMu());
       {
         GraphCollector* collector = ctx->GetGraphCollector();
         mutex_lock mll(collector->mu);
diff --git a/tensorflow/core/common_runtime/eager/profiler.cc b/tensorflow/core/common_runtime/eager/profiler.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d670d6f3440c48e45e1b923452977e8ec25f0d16
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/profiler.cc
@@ -0,0 +1,77 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/eager/profiler.h"
+#include "tensorflow/cc/profiler/profiler.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+
+/*static*/ std::unique_ptr<EagerProfiler> EagerProfiler::Create(
+    EagerContext* const context) {
+  return absl::WrapUnique(new EagerProfiler(context));
+}
+
+void EagerProfiler::BeforeClearRunMetadata() {
+  mutex_lock l(mutex_);
+  run_metadata_.MergeFrom(*context_->RunMetadataProto());
+}
+
+Status EagerProfiler::Status() {
+  mutex_lock l(mutex_);
+  return status_;
+}
+
+Status EagerProfiler::SerializeToString(string* content) {
+  {
+    mutex_lock l(mutex_);
+    if (!status_.ok()) return status_;
+  }
+  RunMetadata metadata;
+  GetMergetRunMetadata(&metadata);
+
+  // TODO(fishx): update tfprof to use a lighter representation instead of
+  // GraphDef.
+  GraphDef graph;
+  std::unique_ptr<tfprof::Profiler> tfprof(new tfprof::Profiler(graph));
+  tfprof->AddStep(0, metadata);
+  return tfprof->SerializeToString(content);
+}
+
+EagerProfiler::EagerProfiler(EagerContext* const context) : context_(context) {
+  LOG(INFO) << "Eager Profiler started.";
+
+  status_ = context_->RegisterRunMetadataListener(this);
+  if (!status_.ok()) {
+    LOG(INFO) << "Eager Profiler failed to start. Another profiler is running.";
+    return;
+  }
+}
+
+EagerProfiler::~EagerProfiler() {
+  context_->ClearRunMetadataListener();
+  LOG(INFO) << "Eager Profiler ended with status:" << status_;
+}
+
+void EagerProfiler::GetMergetRunMetadata(RunMetadata* metadata) {
+  mutex_lock ml(*context_->MetadataMu());
+  mutex_lock l(mutex_);
+  *metadata = run_metadata_;
+  metadata->MergeFrom(*context_->RunMetadataProto());
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/profiler.h b/tensorflow/core/common_runtime/eager/profiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..cadbfdb498854cf35166e1eb20901d6c8894bd55
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/profiler.h
@@ -0,0 +1,63 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_PROFILER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_PROFILER_H_
+
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+
+// A profiler which will start profiling when creating the object and will stop
+// when the object is destroyed. It will profile all operations run under the
+// given EagerContext.
+// Multiple instances of it can be created, but at most one of them will profile
+// for each EagerContext. Status() will return OK only for the instance that is
+// profiling.
+// Thread-safety: TFE_Profiler is thread-safe.
+class EagerProfiler : RunMetadataListener {
+ public:
+  // Creates and EagerProfiler and starts profiling.
+  static std::unique_ptr<EagerProfiler> Create(EagerContext* const context);
+
+  // Deletes an exsiting Profiler and enables starting a new one.
+  ~EagerProfiler() override;
+
+  void BeforeClearRunMetadata() override LOCKS_EXCLUDED(mutex_)
+      EXCLUSIVE_LOCKS_REQUIRED(context_->MetadataMu());
+  tensorflow::Status Status() LOCKS_EXCLUDED(mutex_);
+
+  tensorflow::Status SerializeToString(string* content) LOCKS_EXCLUDED(mutex_);
+
+ private:
+  // Constructs an instance of the class and starts profiling
+  explicit EagerProfiler(EagerContext* const context);
+
+  // Profiler is neither copyable or movable.
+  EagerProfiler(const EagerProfiler&) = delete;
+  EagerProfiler& operator=(const EagerProfiler&) = delete;
+
+  void GetMergetRunMetadata(RunMetadata* metadata) LOCKS_EXCLUDED(mutex_);
+
+  RunMetadata run_metadata_ GUARDED_BY(mutex_);
+  tensorflow::Status status_ GUARDED_BY(mutex_);
+  EagerContext* const context_;
+  mutex mutex_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_PROFILER_H_
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index df2ee6c61822d3a4cea88a70096baab9596c6c9f..07c8c4a5d45a5043677ff3c6c2c31ba00594fa44 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -1244,6 +1244,7 @@ class ExecutorState {
   Rendezvous* rendezvous_;
   CollectiveExecutor* collective_executor_ = nullptr;
   SessionState* session_state_;
+  string session_handle_;
   TensorStore* tensor_store_;
   // Step-local container.
   ScopedStepContainer* step_container_;
@@ -1371,6 +1372,7 @@ ExecutorState::ExecutorState(const Executor::Args& args, ExecutorImpl* impl)
       rendezvous_(args.rendezvous),
       collective_executor_(args.collective_executor),
       session_state_(args.session_state),
+      session_handle_(args.session_handle),
       tensor_store_(args.tensor_store),
       step_container_(args.step_container),
       stats_collector_(args.stats_collector),
@@ -1616,6 +1618,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
   params.rendezvous = rendezvous_;
   params.collective_executor = collective_executor_;
   params.session_state = session_state_;
+  params.session_handle = session_handle_;
   params.tensor_store = tensor_store_;
   params.cancellation_manager = cancellation_manager_;
   params.call_frame = call_frame_;
diff --git a/tensorflow/core/common_runtime/executor.h b/tensorflow/core/common_runtime/executor.h
index 02930168a4b053895827a54d065011bc9d657463..4be60c67713bc801a8249201d65a5dbc26646138 100644
--- a/tensorflow/core/common_runtime/executor.h
+++ b/tensorflow/core/common_runtime/executor.h
@@ -88,6 +88,8 @@ class Executor {
     CallFrameInterface* call_frame = nullptr;
     CancellationManager* cancellation_manager = nullptr;
     SessionState* session_state = nullptr;
+    // Unique session identifier. Can be empty.
+    string session_handle;
     TensorStore* tensor_store = nullptr;
     ScopedStepContainer* step_container = nullptr;
     CollectiveExecutor* collective_executor = nullptr;
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 7eb622dc117f40a68079e6cea1a829227acfed7a..48f32df27571aed2fd1fdc613f1b9aa4f660f665 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -104,6 +104,10 @@ static Node* AddIdentity(Graph* g, Endpoint input) {
   NodeDef ndef;
   ndef.set_name(g->NewName(kNodeLabel));
   ndef.set_op("Identity");
+  // NOTE(skyewm): we explicitly set the device here to address a multi-GPU
+  // performance issue where this Identity would be placed alone on a GPU,
+  // causing unnecessary device traffic. See b/122483225 for details.
+  ndef.set_device(input.node->def().device());
   ndef.add_input(input.name());
   AddNodeAttr("T", BaseType(input.dtype()), &ndef);
   Status s;
@@ -453,7 +457,9 @@ class CallOp : public AsyncOpKernel {
   CallOp(FunctionLibraryRuntime::Handle handle, OpKernelConstruction* ctx)
       : AsyncOpKernel(ctx), handle_(handle) {}
 
-  ~CallOp() override {}
+  ~CallOp() override {
+    // TODO(iga): Release the cached handle_
+  }
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     FunctionLibraryRuntime* lib = ctx->function_library();
@@ -628,11 +634,20 @@ bool FunctionLibraryRuntimeImpl::IsLocalTarget(
     const InstantiateOptions& options) {
   if (device_ == nullptr) return true;
   if (options.target.empty()) return true;
+  if (options.is_multi_device_function) return false;
   Device* target_device;
   if (!device_mgr_->LookupDevice(options.target, &target_device).ok()) {
+    VLOG(1) << "Not instantiating function in FLR because failed to "
+            << "find device " << options.target << " in device manager";
+    return false;
+  }
+  if (target_device != device_) {
+    VLOG(1) << "Not instantiating function in FLR because target device "
+            << options.target
+            << " is different from FLR's device: " << device_->DebugString();
     return false;
   }
-  return target_device == device_;
+  return true;
 }
 
 Status FunctionLibraryRuntimeImpl::Instantiate(
@@ -732,15 +747,32 @@ Status FunctionLibraryRuntimeImpl::ReleaseHandle(Handle handle) {
   if (h == kInvalidLocalHandle) {
     return parent_->ReleaseHandle(handle);
   }
-  mutex_lock l(mu_);
-  CHECK_EQ(1, items_.count(h));
-  std::unique_ptr<Item>& item = items_[h];
-  --item->instantiation_counter;
-  if (item->instantiation_counter == 0) {
-    items_.erase(h);
-    TF_RETURN_IF_ERROR(parent_->RemoveHandle(handle));
+
+  std::unique_ptr<Item> item_to_delete;
+  Status parent_status;
+  {
+    mutex_lock l(mu_);
+    auto it = items_.find(h);
+    if (it == items_.end()) {
+      return errors::Internal(
+          "Inconsistent FunctionLibraryRuntime. Expected to find an item for "
+          "handle ",
+          h, " but found none");
+    }
+    std::unique_ptr<Item>& item = it->second;
+    --item->instantiation_counter;
+    if (item->instantiation_counter == 0) {
+      // We don't simply erase h's item because that would trigger
+      // item destruction while holding mu_. Item destruction can
+      // trigger graph destruction. If the graph contains kernels like
+      // CallOp or PartitionCallOp, their destructors will release cached
+      // function handles, resulting in deadlock here.
+      item_to_delete = std::move(item);
+      items_.erase(h);
+      parent_status = parent_->RemoveHandle(handle);
+    }
   }
-  return Status::OK();
+  return parent_status;
 }
 
 void DumpGraph(StringPiece label, const Graph* g) {
@@ -1407,6 +1439,12 @@ void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
     if (override_device || ndef.device().empty()) {
       ndef.set_device(caller->def().device());
     }
+    for (auto& attr : *ndef.mutable_attr()) {
+      if (attr.first == "_class") {
+        attr.second.set_s(
+            strings::StrCat(caller->name(), "/", attr.second.s()));
+      }
+    }
     Node* clone = g->AddNode(ndef, &s);
     TF_CHECK_OK(s);
     node_map[n->id()] = clone;
@@ -1586,6 +1624,13 @@ void ToGraphDef(const Graph* g, GraphDef* gdef, bool pretty) {
     for (const auto& attr : n->attrs()) {
       (*ndef->mutable_attr())[attr.first] = attr.second;
     }
+
+    if (!n->assigned_device_name().empty()) {
+      ndef->set_device(n->assigned_device_name());
+    } else {
+      ndef->set_device(n->requested_device());
+    }
+
     inputs.clear();
     inputs.resize(n->num_inputs());
     for (const Edge* e : n->in_edges()) {
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index cab95cb596858f99285c3cfc5673f87b70368a32..97e46f406cf96cc284ec14718f9500767f5e9861 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -246,9 +246,10 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     if (!status.ok()) return status;
 
     Status status2 = Run(flr, handle, opts, args, std::move(rets));
-    EXPECT_TRUE(errors::IsInvalidArgument(status2));
-    EXPECT_TRUE(
-        str_util::StrContains(status2.error_message(), "remote execution."));
+    EXPECT_TRUE(errors::IsNotFound(status2))
+        << "Actual status: " << status2.ToString();
+    EXPECT_TRUE(str_util::StrContains(status2.error_message(), "Handle"));
+    EXPECT_TRUE(str_util::StrContains(status2.error_message(), "not found"));
 
     return status;
   }
@@ -316,9 +317,9 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     if (!status.ok()) return status;
 
     Status status2 = Run(flr, handle, opts, args, std::move(rets));
-    EXPECT_TRUE(errors::IsInvalidArgument(status2));
-    EXPECT_TRUE(
-        str_util::StrContains(status2.error_message(), "remote execution."));
+    EXPECT_TRUE(errors::IsNotFound(status2));
+    EXPECT_TRUE(str_util::StrContains(status2.error_message(), "Handle"));
+    EXPECT_TRUE(str_util::StrContains(status2.error_message(), "not found"));
 
     return status;
   }
diff --git a/tensorflow/core/common_runtime/function_threadpool_test.cc b/tensorflow/core/common_runtime/function_threadpool_test.cc
index 1b803736fb881c8f133198ab39e5801a357c5659..1dca25e0064e12c9b21c76102278e1bebdc67a4a 100644
--- a/tensorflow/core/common_runtime/function_threadpool_test.cc
+++ b/tensorflow/core/common_runtime/function_threadpool_test.cc
@@ -149,9 +149,9 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     if (!status.ok()) return status;
 
     Status status2 = Run(flr, handle, opts, args, std::move(rets));
-    EXPECT_TRUE(errors::IsInvalidArgument(status2));
-    EXPECT_TRUE(
-        str_util::StrContains(status2.error_message(), "remote execution."));
+    EXPECT_TRUE(errors::IsNotFound(status2));
+    EXPECT_TRUE(str_util::StrContains(status2.error_message(), "Handle"));
+    EXPECT_TRUE(str_util::StrContains(status2.error_message(), "not found"));
 
     return status;
   }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index 989ddbe4af53ee200f994ea8e3f2ae42e5bcab7f..c22bfcea2cedab93409d761686d852a5c4bbeeb9 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -44,8 +44,9 @@ bool CheckMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
   se::DeviceMemory<int64> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
   int64 tmp[MASK_WORDS];
 
-  if (!exec->SynchronousMemcpy(&tmp, gpu_ptr, MASK_BYTES)) {
-    LOG(FATAL) << "Could not copy debug mask";
+  Status result = exec->SynchronousMemcpyD2H(gpu_ptr, MASK_BYTES, tmp);
+  if (!result.ok()) {
+    LOG(FATAL) << "Could not copy debug mask, " << result;
   }
 
   bool ok = true;
@@ -63,8 +64,9 @@ bool CheckMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
 
 void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
   se::DeviceMemory<int64> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
-  if (!exec->SynchronousMemcpy(&gpu_ptr, mask, MASK_BYTES)) {
-    LOG(FATAL) << "Could not copy debug mask";
+  Status result = exec->SynchronousMemcpyH2D(mask, MASK_BYTES, &gpu_ptr);
+  if (!result.ok()) {
+    LOG(FATAL) << "Could not copy debug mask, " << result;
   }
 }
 
@@ -171,8 +173,10 @@ void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   se::DeviceMemory<float> nan_ptr{
       se::DeviceMemoryBase{static_cast<float*>(allocated_ptr), req_size}};
 
-  if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
-    LOG(ERROR) << "Could not initialize to NaNs";
+  Status result =
+      stream_exec_->SynchronousMemcpyH2D(&nans[0], req_size, &nan_ptr);
+  if (!result.ok()) {
+    LOG(ERROR) << "Could not initialize to NaNs, " << result;
   }
 
   return allocated_ptr;
@@ -185,8 +189,10 @@ void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
                             std::nanf(""));
     se::DeviceMemory<float> nan_ptr{
         se::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
-    if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
-      LOG(ERROR) << "Could not initialize to NaNs";
+    Status result =
+        stream_exec_->SynchronousMemcpyH2D(&nans[0], req_size, &nan_ptr);
+    if (!result.ok()) {
+      LOG(ERROR) << "Could not initialize to NaNs, " << result;
     }
   }
 
diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index 04d658f0472e3ea07855f4bae6a89ad5199eb2f9..9ecbc34f5fee204a5fe52ab93d74ea542e043edf 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -59,6 +59,7 @@ GraphExecutionState::GraphExecutionState(
     : stateful_placements_(options.stateful_placements),
       device_set_(options.device_set),
       session_options_(options.session_options),
+      session_handle_(options.session_handle),
       flib_def_(new FunctionLibraryDefinition(OpRegistry::Global(),
                                               graph_def->library())),
       graph_(nullptr) {
@@ -198,6 +199,7 @@ Status GraphExecutionState::Extend(
   GraphExecutionStateOptions combined_options;
   combined_options.device_set = device_set_;
   combined_options.session_options = session_options_;
+  combined_options.session_handle = session_handle_;
   combined_options.stateful_placements = stateful_placements_;
 
   // NOTE(mrry): `gdef` is no longer valid after the constructor
@@ -558,6 +560,7 @@ Status GraphExecutionState::InitBaseGraph(const BuildGraphOptions& options) {
   RestoreStatefulNodes(new_graph.get());
 
   GraphOptimizationPassOptions optimization_options;
+  optimization_options.session_handle = session_handle_;
   optimization_options.session_options = session_options_;
   optimization_options.graph = &new_graph;
   optimization_options.flib_def = flib_def_.get();
diff --git a/tensorflow/core/common_runtime/graph_execution_state.h b/tensorflow/core/common_runtime/graph_execution_state.h
index 9cabe478a68a72252579755dca1e8957242344ba..56315bb1ef7947d788a7ada6ef0fa14f50e2a978 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.h
+++ b/tensorflow/core/common_runtime/graph_execution_state.h
@@ -41,6 +41,8 @@ struct RewriteGraphMetadata;
 struct GraphExecutionStateOptions {
   const DeviceSet* device_set = nullptr;
   const SessionOptions* session_options = nullptr;
+  // Unique session identifier. Can be empty.
+  string session_handle;
   // A map from node name to device name, representing the unchangeable
   // placement of stateful nodes.
   std::unordered_map<string, string> stateful_placements;
@@ -192,6 +194,8 @@ class GraphExecutionState {
   GraphDef original_graph_def_;            // Immutable after ctor.
   const DeviceSet* device_set_;            // Not owned
   const SessionOptions* session_options_;  // Not owned
+  // Unique session identifier. Can be empty.
+  string session_handle_;
 
   // Map from name to Node for the full graph in placed_.
   NodeNameToCostIdMap node_name_to_cost_id_map_;
diff --git a/tensorflow/core/common_runtime/optimization_registry.h b/tensorflow/core/common_runtime/optimization_registry.h
index 6fcd2afd2752007996d16358d5118211357fe6c6..0e31f389aa71a5734b1f11b95a056c0d07aabeb9 100644
--- a/tensorflow/core/common_runtime/optimization_registry.h
+++ b/tensorflow/core/common_runtime/optimization_registry.h
@@ -35,6 +35,7 @@ struct SessionOptions;
 // as a key into a state dictionary if it wants to keep state across
 // calls.
 struct GraphOptimizationPassOptions {
+  // Filled in by DirectSession for PRE_PLACEMENT optimizations. Can be empty.
   string session_handle;
   const SessionOptions* session_options = nullptr;
   const CostModel* cost_model = nullptr;
@@ -94,6 +95,10 @@ class OptimizationPassRegistry {
   void Register(Grouping grouping, int phase,
                 std::unique_ptr<GraphOptimizationPass> pass);
 
+  const std::map<Grouping, GraphOptimizationPasses>& groups() {
+    return groups_;
+  }
+
   // Run all passes in grouping, ordered by phase, with the same
   // options.
   Status RunGrouping(Grouping grouping,
diff --git a/tensorflow/core/common_runtime/partitioning_utils.cc b/tensorflow/core/common_runtime/partitioning_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d51caaea8f1d12b472232718c973749e47146728
--- /dev/null
+++ b/tensorflow/core/common_runtime/partitioning_utils.cc
@@ -0,0 +1,143 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/partitioning_utils.h"
+
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_partition.h"
+
+namespace tensorflow {
+
+Status PartitionFunctionGraph(
+    const DeviceSet& device_set, std::unique_ptr<Graph> graph,
+    std::unordered_map<string, std::unique_ptr<Graph>>* subgraphs) {
+  PartitionOptions partition_options;
+  partition_options.node_to_loc = [](const Node* node) {
+    // TODO(iga): To support the distributed case, first split the graph by
+    // worker (e.g,. using the master session's `SplitByWorker` policy), and
+    // then recursively partition the per-worker shards at the remote worker(s).
+    // Currently, we simply split the graph at device boundaries.
+    return node->assigned_device_name();
+  };
+  int64 edge_name_counter = 0;
+  partition_options.new_name = [&edge_name_counter](const string& prefix) {
+    return strings::StrCat(prefix, "/_", ++edge_name_counter);
+  };
+  partition_options.get_incarnation =
+      [&device_set](const string& name) -> int64 {
+    const Device* d = device_set.FindDeviceByName(name);
+    if (d == nullptr) {
+      return PartitionOptions::kIllegalIncarnation;
+    } else {
+      return d->attributes().incarnation();
+    }
+  };
+  partition_options.control_flow_added = false;
+  std::unordered_map<string, GraphDef> partitions;
+  TF_RETURN_IF_ERROR(Partition(partition_options, graph.get(), &partitions));
+
+  for (const auto& partition : partitions) {
+    const string& device = partition.first;
+    const GraphDef& graph_def = partition.second;
+    // Each partition gets a copy of all the
+    // std::unique_ptr<Graph> subgraph(new Graph(graph->flib_def()));
+    std::unique_ptr<Graph> subgraph(
+        new Graph(graph->flib_def().ReachableDefinitions(graph_def)));
+    FunctionLibraryDefinition global_flib(OpRegistry::Global(), {});
+    TF_CHECK_OK(subgraph->AddFunctionLibrary(global_flib.ToProto()));
+    GraphConstructorOptions opts;
+    opts.allow_internal_ops = true;
+    opts.expect_device_spec = true;
+    TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, graph_def, subgraph.get()));
+    subgraphs->emplace(device, std::move(subgraph));
+  }
+
+  return Status::OK();
+}
+
+Status UpdateArgAndRetvalMetadata(
+    Graph* subgraph, std::vector<int>* arg_indices,
+    std::vector<int>* ret_indices,
+    std::vector<AllocatorAttributes>* arg_alloc_attrs,
+    std::vector<AllocatorAttributes>* ret_alloc_attrs) {
+  std::vector<std::pair<Node*, int>> arg_nodes;
+  std::vector<std::pair<Node*, int>> ret_nodes;
+  const AttrValue* attr_value;
+
+  // Find the Arg and Retval nodes, along with their corresponding indices
+  // in the original function.
+  for (Node* node : subgraph->op_nodes()) {
+    string node_type = node->type_string();
+    if (node_type == FunctionLibraryDefinition::kArgOp) {
+      TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
+      int index = static_cast<int>(attr_value->i());
+      arg_indices->push_back(index);
+      arg_nodes.push_back(std::make_pair(node, index));
+    } else if (node_type == FunctionLibraryDefinition::kRetOp) {
+      TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
+      int index = static_cast<int>(attr_value->i());
+      ret_indices->push_back(index);
+      ret_nodes.push_back(std::make_pair(node, index));
+    }
+  }
+
+  for (int i = 0; i < arg_nodes.size(); ++i) {
+    Node* arg = arg_nodes[i].first;
+    arg->AddAttr("index", i);
+    TF_RETURN_IF_ERROR(arg->attrs().Find("T", &attr_value));
+    AllocatorAttributes alloc_attr;
+    DataType type = attr_value->type();
+    if (MTypeFromDType(type) == HOST_MEMORY) {
+      alloc_attr.set_on_host(true);
+    }
+    arg_alloc_attrs->push_back(alloc_attr);
+  }
+  for (int i = 0; i < ret_nodes.size(); ++i) {
+    Node* ret = ret_nodes[i].first;
+    ret->AddAttr("index", i);
+    TF_RETURN_IF_ERROR(ret->attrs().Find("T", &attr_value));
+    AllocatorAttributes alloc_attr;
+    DataType type = attr_value->type();
+    if (MTypeFromDType(type) == HOST_MEMORY) {
+      alloc_attr.set_on_host(true);
+    }
+    ret_alloc_attrs->push_back(alloc_attr);
+  }
+
+  return Status::OK();
+}
+
+std::vector<Tensor> GetArgsForIndices(const std::vector<int>& indices,
+                                      gtl::ArraySlice<Tensor> arguments) {
+  std::vector<Tensor> args;
+  args.reserve(indices.size());
+  for (int i : indices) {
+    args.push_back(arguments[i]);
+  }
+  return args;
+}
+
+string FunctionNameGenerator::GetName() {
+  for (;; ++counter_) {
+    const string candidate = strings::StrCat(name_, "_", counter_);
+    if (flib_def_->Find(candidate) == nullptr) {
+      return candidate;
+    }
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/partitioning_utils.h b/tensorflow/core/common_runtime/partitioning_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..c282647e7027414b4f925d1d6d93fcc1624dc81a
--- /dev/null
+++ b/tensorflow/core/common_runtime/partitioning_utils.h
@@ -0,0 +1,90 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PARTITIONING_UTILS_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_PARTITIONING_UTILS_H_
+
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// Given a `device_set` and a `graph`, partitions the `graph` into
+// `subgraphs`. `subgraphs` maps device names to the graph assigned to that
+// device. `graph` must have been placed (e.g. by running Placer),
+// i.e. all nodes must have an assigned_device set.
+// `graph` is non-const because the underlying Partition() function transforms
+// the graph to correctly partition distributed control flow.
+Status PartitionFunctionGraph(
+    const DeviceSet& device_set, std::unique_ptr<Graph> graph,
+    std::unordered_map<string, std::unique_ptr<Graph>>* subgraphs);
+
+// Each subgraph produced by partitioning the function body contains a subset
+// of the original `Arg` and `Retval` nodes. This function performs
+// bookkeeping to track which `Arg` and `Retval` nodes were placed on a
+// particular device / subgraph.
+//
+// More specifically, this function
+//  (1) rewrites the indices of the `Arg` and `Retval` nodes placed
+//      on a particular device.  When a function is parittioned each
+//      partition, `subgraph`, get a subset of the arguments and
+//      return values. The `index` attributes of these _Arg and _Retval
+//      nodes reflect the indices of these parameters in the original
+//      function. To convert `subgraph` to a function, we need to replace
+//      there original indices with 0, 1, 2, ... .
+//
+//      The argument and return value order in the partitioned function is
+//      determined by the node iteration order in `subgraph`. This order
+//      is also used in UpdateArgAndRetvalMetadata. This is fine because the
+//      node iteration order is deterministic - it follows the node ids.
+//  (2) records the subsets of `Arg` and `Retval` nodes assigned to the
+//      device in `*_indices`, and
+//  (3) records which `Arg` and `Retval` nodes live in host memory in
+//      `*_alloc_attrs`.
+Status UpdateArgAndRetvalMetadata(
+    Graph* subgraph, std::vector<int>* arg_indices,
+    std::vector<int>* ret_indices,
+    std::vector<AllocatorAttributes>* arg_alloc_attrs,
+    std::vector<AllocatorAttributes>* ret_alloc_attrs);
+
+// Extracts tensors at `indices` from `arguments`.
+std::vector<Tensor> GetArgsForIndices(const std::vector<int>& indices,
+                                      gtl::ArraySlice<Tensor> arguments);
+
+// Utility for generating function names not present in `flib_def`, using
+// given `name` as the base for the name.
+class FunctionNameGenerator {
+ public:
+  // `flib_def` must outlive this.
+  FunctionNameGenerator(const FunctionLibraryDefinition* flib_def,
+                        const string& name)
+      : flib_def_(flib_def), name_(name), counter_(0) {}
+
+  // Returns a function name not present in `flib_def` using `name` as
+  // the base and appending a numeric suffix.
+  string GetName();
+
+ private:
+  const FunctionLibraryDefinition* flib_def_;
+  const string name_;
+  uint32 counter_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_PARTITIONING_UTILS_H_
diff --git a/tensorflow/core/common_runtime/partitioning_utils_test.cc b/tensorflow/core/common_runtime/partitioning_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0d4e36222ba7809dae73fb6eaaceda7fd497288a
--- /dev/null
+++ b/tensorflow/core/common_runtime/partitioning_utils_test.cc
@@ -0,0 +1,207 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/partitioning_utils.h"
+
+#include <map>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/function_testlib.h"
+#include "tensorflow/core/common_runtime/placer.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+class PartitioningUtilsTest : public ::testing::Test {
+ public:
+  void SetUp() override {
+    SessionOptions options;
+    auto* device_count = options.config.mutable_device_count();
+    device_count->insert({"CPU", 2});
+    std::vector<std::unique_ptr<Device>> devices;
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, "/job:a/replica:0/task:0",
+                                          &devices));
+    device0_ = devices[0].get();
+    device1_ = devices[1].get();
+    device_mgr_.reset(new DeviceMgr(std::move(devices)));
+
+    for (auto d : device_mgr_->ListDevices()) {
+      device_set_.AddDevice(d);
+    }
+  }
+
+  void SwapGraph(Graph* graph, bool assign_device = false) {
+    Scope s = Scope::NewRootScope();
+    if (assign_device) {
+      s = s.WithDevice(device0_->name());
+    }
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto id_x = ops::Identity(s.WithOpName("id_x"), x);
+    auto id_y = ops::Identity(s.WithOpName("id_y"), y);
+    auto dx_retval = ops::_Retval(s.WithOpName("retval1"), id_y, 0);
+    auto dy_retval = ops::_Retval(s.WithOpName("retval2"), id_x, 1);
+    TF_ASSERT_OK(s.ToGraph(graph));
+
+    if (assign_device) {
+      Placer placer(graph, &device_set_, nullptr, /* No session options */
+                    device0_);
+      TF_ASSERT_OK(placer.Run());
+    }
+  }
+
+  void TwoDeviceSwapGraph(Graph* graph) {
+    Scope s = Scope::NewRootScope();
+    Scope s1 = s.WithDevice("/job:a/replica:0/task:0/device:CPU:0");
+    Scope s2 = s.WithDevice("/job:a/replica:0/task:0/device:CPU:1");
+    auto x = ops::_Arg(s1.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s2.WithOpName("y"), DT_FLOAT, 1);
+    auto id_x = ops::Identity(s1.WithOpName("id_x"), x);
+    auto id_y = ops::Identity(s2.WithOpName("id_y"), y);
+    auto dx_retval = ops::_Retval(s2.WithOpName("retval1"), id_y, 0);
+    auto dy_retval = ops::_Retval(s1.WithOpName("retval2"), id_x, 1);
+    TF_ASSERT_OK(s.ToGraph(graph));
+    Placer placer(graph, &device_set_, nullptr, /* No session options */
+                  device0_);
+    TF_ASSERT_OK(placer.Run());
+  }
+
+  // Fills subgraph with an identify function arg->identity->ret
+  // where each node has type `dtype` and arg/ret nodes have
+  // indices `arg_index` and `ret_index`.
+  void SubGraph(Graph* subgraph, DataType dtype, int arg_index, int ret_index) {
+    Scope s = Scope::NewRootScope();
+    Scope s1 = s.WithDevice("/job:a/replica:0/task:0/device:CPU:0");
+    auto x = ops::_Arg(s1.WithOpName("x"), dtype, arg_index);
+    auto id_x = ops::Identity(s1.WithOpName("id_x"), x);
+    auto dx_retval = ops::_Retval(s1.WithOpName("retval1"), id_x, ret_index);
+    TF_ASSERT_OK(s.ToGraph(subgraph));
+    Placer placer(subgraph, &device_set_, nullptr, /* No session options */
+                  device0_);
+    TF_ASSERT_OK(placer.Run());
+  }
+
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  Device* device0_ = nullptr;  // Not owned. (Owned by device_mgr_.)
+  Device* device1_ = nullptr;  // Not owned. (Owned by device_mgr_.)
+  DeviceSet device_set_;
+};
+
+TEST_F(PartitioningUtilsTest, GraphWithoutAssignedDevicesFails) {
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  SwapGraph(graph.get());
+
+  std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
+  Status status =
+      PartitionFunctionGraph(device_set_, std::move(graph), &subgraphs);
+  ASSERT_TRUE(errors::IsInvalidArgument(status)) << status.ToString();
+}
+
+TEST_F(PartitioningUtilsTest, OneDevice) {
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  SwapGraph(graph.get(), true);
+  int num_nodes = graph->num_op_nodes();
+
+  std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
+  Status status =
+      PartitionFunctionGraph(device_set_, std::move(graph), &subgraphs);
+  ASSERT_TRUE(status.ok()) << status.ToString();
+
+  ASSERT_EQ(1, subgraphs.size());
+  const auto& pair = *subgraphs.begin();
+  ASSERT_EQ("/job:a/replica:0/task:0/device:CPU:0", pair.first);
+  ASSERT_EQ(num_nodes, pair.second->num_op_nodes());
+}
+
+TEST_F(PartitioningUtilsTest, TwoDevices) {
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  TwoDeviceSwapGraph(graph.get());
+
+  std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
+  Status status =
+      PartitionFunctionGraph(device_set_, std::move(graph), &subgraphs);
+  ASSERT_TRUE(status.ok()) << status.ToString();
+
+  ASSERT_EQ(2, subgraphs.size());
+
+  const auto& part1 = subgraphs["/job:a/replica:0/task:0/device:CPU:0"];
+  ASSERT_EQ(3, part1->num_op_nodes());
+  const auto& part2 = subgraphs["/job:a/replica:0/task:0/device:CPU:1"];
+  ASSERT_EQ(3, part2->num_op_nodes());
+}
+
+void CheckIndices(const std::vector<int>& expected,
+                  const std::vector<int>& actual) {
+  ASSERT_EQ(expected.size(), actual.size());
+  for (int i = 0; i < expected.size(); ++i) {
+    ASSERT_EQ(expected[i], actual[i]) << " at index " << i;
+  }
+}
+
+void CheckAlloc(const std::vector<bool>& expected,
+                const std::vector<AllocatorAttributes>& actual) {
+  ASSERT_EQ(expected.size(), actual.size());
+  for (int i = 0; i < expected.size(); ++i) {
+    ASSERT_EQ(expected[i], actual[i].on_host()) << " at index " << i;
+  }
+}
+
+void CheckIndex(const Node& node, int expected_index) {
+  const AttrValue* attr_value;
+  TF_ASSERT_OK(node.attrs().Find("index", &attr_value));
+  int index = static_cast<int>(attr_value->i());
+  ASSERT_EQ(expected_index, index);
+}
+
+TEST_F(PartitioningUtilsTest, UpdateArgsAndRets) {
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  SubGraph(graph.get(), DT_FLOAT, 3, 5);
+
+  std::vector<int> arg_indices;
+  std::vector<int> ret_indices;
+  std::vector<AllocatorAttributes> arg_alloc_attrs;
+  std::vector<AllocatorAttributes> ret_alloc_attrs;
+
+  Status status =
+      UpdateArgAndRetvalMetadata(graph.get(), &arg_indices, &ret_indices,
+                                 &arg_alloc_attrs, &ret_alloc_attrs);
+  ASSERT_TRUE(status.ok()) << status.ToString();
+
+  CheckIndices({3}, arg_indices);
+  CheckIndices({5}, ret_indices);
+  CheckAlloc({false}, arg_alloc_attrs);
+  CheckAlloc({false}, ret_alloc_attrs);
+
+  std::unordered_map<string, Node*> nodes = graph->BuildNodeNameIndex();
+  ASSERT_EQ(1, nodes.count("x"));
+  CheckIndex(*nodes["x"], 0);
+  ASSERT_EQ(1, nodes.count("retval1"));
+  CheckIndex(*nodes["retval1"], 0);
+}
+
+}  // anonymous namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index c43a9d7dc211dd82a1b5771ad22888a2ba275a48..b236343a0f2f7e6190d6649724bdd9495e63b681 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -16,11 +16,27 @@ limitations under the License.
 
 #include <utility>
 
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/common_runtime/partitioning_utils.h"
+#include "tensorflow/core/common_runtime/placer.h"
+#include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/common_runtime/rendezvous_util.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_partition.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/ptr_util.h"
+#include "tensorflow/core/util/reffed_status_callback.h"
 
 namespace tensorflow {
 
@@ -52,13 +68,13 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
       parent_(parent) {
   if (device_mgr == nullptr) {
     flr_map_[nullptr] = NewFunctionLibraryRuntime(
-        nullptr, env, nullptr, graph_def_version, lib_def, default_thread_pool,
+        nullptr, env, nullptr, graph_def_version, lib_def_, default_thread_pool,
         optimizer_options, this);
     return;
   }
   for (Device* d : device_mgr->ListDevices()) {
     flr_map_[d] = NewFunctionLibraryRuntime(
-        device_mgr, env, d, graph_def_version, lib_def, default_thread_pool,
+        device_mgr, env, d, graph_def_version, lib_def_, default_thread_pool,
         optimizer_options, this);
   }
 }
@@ -77,13 +93,13 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
       parent_(parent) {
   if (device_mgr == nullptr) {
     flr_map_[nullptr] = NewFunctionLibraryRuntime(
-        nullptr, env, nullptr, graph_def_version, lib_def, default_thread_pool,
+        nullptr, env, nullptr, graph_def_version, lib_def_, default_thread_pool,
         optimizer_options, std::move(custom_kernel_creator), this);
     return;
   }
   for (Device* d : device_mgr->ListDevices()) {
     flr_map_[d] = NewFunctionLibraryRuntime(
-        device_mgr, env, d, graph_def_version, lib_def, default_thread_pool,
+        device_mgr, env, d, graph_def_version, lib_def_, default_thread_pool,
         optimizer_options, custom_kernel_creator, this);
   }
 }
@@ -126,7 +142,7 @@ void ProcessFunctionLibraryRuntime::ReceiveTensorsAsync(
 }
 
 Status ProcessFunctionLibraryRuntime::GetDeviceIncarnation(
-    const string& device_name, int64* incarnation) {
+    const string& device_name, int64* incarnation) const {
   FunctionLibraryRuntime* flr = GetFLR(device_name);
   if (flr == nullptr) {
     return errors::InvalidArgument("Device name: ", device_name, " not found");
@@ -136,7 +152,7 @@ Status ProcessFunctionLibraryRuntime::GetDeviceIncarnation(
 }
 
 Status ProcessFunctionLibraryRuntime::GetDeviceContext(
-    const string& device_name, DeviceContext** device_context) {
+    const string& device_name, DeviceContext** device_context) const {
   *device_context = nullptr;
   FunctionLibraryRuntime* flr = GetFLR(device_name);
   if (flr == nullptr) {
@@ -181,9 +197,26 @@ FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandle(
     const string& function_key, const string& device_name,
     FunctionLibraryRuntime::LocalHandle local_handle) {
   mutex_lock l(mu_);
+  return AddHandleLocked(function_key, device_name, local_handle);
+}
+
+FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandleLocked(
+    const string& function_key, const string& device_name,
+    FunctionLibraryRuntime::LocalHandle local_handle) {
+  auto h = next_handle_;
+  function_data_[h] =
+      MakeUnique<FunctionData>(device_name, local_handle, function_key);
+  table_[function_key] = h;
+  next_handle_++;
+  return h;
+}
+
+FunctionLibraryRuntime::Handle
+ProcessFunctionLibraryRuntime::AddMultiDeviceHandle(
+    std::unique_ptr<MultiDeviceFunctionData> data, const string& function_key) {
+  mutex_lock l(mu_);
   auto h = next_handle_;
-  function_data_[h] = MakeUnique<FunctionData>(
-      device_name, local_handle, function_key);
+  mdevice_data_[h] = std::move(data);
   table_[function_key] = h;
   next_handle_++;
   return h;
@@ -196,14 +229,20 @@ FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::GetHandle(
 }
 
 bool ProcessFunctionLibraryRuntime::IsInstantiatedOnDevice(
-    const string& device_name, FunctionLibraryRuntime::Handle handle) {
+    const string& device_name, FunctionLibraryRuntime::Handle handle) const {
   return GetHandleOnDevice(device_name, handle) != kInvalidHandle;
 }
 
 FunctionLibraryRuntime::LocalHandle
 ProcessFunctionLibraryRuntime::GetHandleOnDevice(
-    const string& device_name, FunctionLibraryRuntime::Handle handle) {
+    const string& device_name, FunctionLibraryRuntime::Handle handle) const {
   tf_shared_lock l(mu_);
+
+  auto miter = mdevice_data_.find(handle);
+  if (miter != mdevice_data_.end()) {
+    return kInvalidLocalHandle;
+  }
+
   auto iter = function_data_.find(handle);
   if (iter == function_data_.end()) {
     return kInvalidLocalHandle;
@@ -216,7 +255,7 @@ ProcessFunctionLibraryRuntime::GetHandleOnDevice(
 }
 
 string ProcessFunctionLibraryRuntime::GetDeviceName(
-    FunctionLibraryRuntime::Handle handle) {
+    FunctionLibraryRuntime::Handle handle) const {
   tf_shared_lock l(mu_);
   auto iter = function_data_.find(handle);
   CHECK(iter != function_data_.end());
@@ -224,10 +263,493 @@ string ProcessFunctionLibraryRuntime::GetDeviceName(
   return function_data->target_device();
 }
 
+ProcessFunctionLibraryRuntime::MultiDeviceFunctionData*
+ProcessFunctionLibraryRuntime::IsMultiDevice(
+    FunctionLibraryRuntime::Handle handle) const {
+  tf_shared_lock l(mu_);
+  const auto& it = mdevice_data_.find(handle);
+  if (it != mdevice_data_.end()) {
+    return it->second.get();
+  }
+  return nullptr;
+}
+
+namespace {
+// Sets `group` to the first colocation group specified in `node`. If no
+// group is specified, does not touch `group`.
+void GetColocationGroup(const Node* node, string* group) {
+  // We hoist the conversion from C-style string literal to string here,
+  // so that we can avoid the many repeated calls to strlen().
+  static const StringPiece kColocationAttrNameStringPiece(kColocationAttrName);
+  const AttrValue* attr_value =
+      node->attrs().Find(kColocationAttrNameStringPiece);
+  if (attr_value != nullptr && attr_value->has_list() &&
+      attr_value->list().s_size() > 0) {
+    *group = attr_value->list().s(0);
+  }
+}
+
+}  // anonymous namespace
+
+Status ProcessFunctionLibraryRuntime::PinArgsAndRets(
+    const std::vector<string>& input_devices,
+    const std::vector<string>& output_devices, const DeviceSet& device_set,
+    Graph* graph) const {
+  // If output_devices are not specified, we want to set the output device
+  // based on the device of the output producing node. The output producing
+  // node can be an arg node because functions can simply return their
+  // arguments. To make sure that the output producing nodes have assigned
+  // devices, we assign them to arguments first.
+  for (Node* node : graph->op_nodes()) {
+    if (node->type_string() == FunctionLibraryDefinition::kArgOp) {
+      const AttrValue* attr_value;
+      TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
+      int64 index = attr_value->i();
+      node->set_assigned_device_name(input_devices[index]);
+    }
+  }
+
+  for (Node* node : graph->op_nodes()) {
+    if (node->type_string() == FunctionLibraryDefinition::kRetOp) {
+      if (output_devices.empty()) {
+        // If output_devices are empty, the node producing retval
+        // must have explicitly assigned device or a colocation constraint
+        // to a node with explicitly assigned device.
+        for (const auto& it : node->in_edges()) {
+          if (!it->IsControlEdge()) {
+            Node* src_node = it->src();
+            const string* src_device = &src_node->requested_device();
+            string colocation_group = "";
+            GetColocationGroup(src_node, &colocation_group);
+            while (src_device->empty() && colocation_group.empty() &&
+                   src_node->IsIdentity()) {
+              src_node = *src_node->in_nodes().begin();
+              src_device = &src_node->requested_device();
+              if (src_device->empty()) {
+                // Some node (e.g. _Args) can have no requested_device,
+                // but have assigned_device.
+                src_device = &src_node->assigned_device_name();
+              }
+
+              GetColocationGroup(src_node, &colocation_group);
+            }
+
+            if (!colocation_group.empty()) {
+              AttrValue::ListValue colo_attr;
+              colo_attr.add_s(colocation_group);
+              std::vector<string> colo_slice = {colocation_group};
+              node->AddAttr(kColocationAttrName, colo_slice);
+            } else if (!src_device->empty()) {
+              // src_device can be a partially specified device. Find the
+              // matching device in the device_set.
+              DeviceNameUtils::ParsedName parsed;
+              if (!DeviceNameUtils::ParseFullName(*src_device, &parsed)) {
+                return errors::InvalidArgument(
+                    "Failed to parse explicit device specification ",
+                    *src_device);
+              }
+              std::vector<Device*> matching_devices;
+              device_set.FindMatchingDevices(parsed, &matching_devices);
+              if (matching_devices.size() != 1) {
+                // Convert a vector of devices to a string.
+                // Using absl::StrJoin did not work in Android builds.
+                string devices = "]";
+                for (Device* device : matching_devices) {
+                  devices.append(device->name());
+                  devices.append(", ");
+                }
+                if (devices.size() > 2) {
+                  devices.resize(devices.size() - 2);
+                }
+                devices.append("]");
+
+                return errors::InvalidArgument(
+                    "When FunctionLibraryRuntime::Options.output_devices are "
+                    "not specified for a multi-device function, the device "
+                    "specification on the output node must match exactly one "
+                    "device. Matched devices are ",
+                    devices);
+              }
+              node->set_assigned_device_name(matching_devices[0]->name());
+            }
+          }
+        }
+      } else {
+        const AttrValue* attr_value;
+        TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
+        int64 index = attr_value->i();
+        // output_devices size is checked in InstantiateMultiDevice
+        DCHECK_GT(output_devices.size(), index);
+        node->set_assigned_device_name(output_devices[index]);
+      }
+    }
+  }
+  return Status::OK();
+}
+
+namespace {
+
+Status ValidateNoListArguments(
+    const protobuf::RepeatedPtrField<OpDef::ArgDef>& args, const char* arg_type,
+    const string& function_name) {
+  for (const OpDef::ArgDef& arg : args) {
+    if (!arg.number_attr().empty() || !arg.type_list_attr().empty()) {
+      return errors::InvalidArgument(
+          "Function ", function_name, " has an ", arg_type, " named \"",
+          arg.name(),
+          "\" that is a list of tensors."
+          " Multi-device functions support only single-tensor inputs "
+          " and outputs");
+    }
+  }
+  return Status::OK();
+}
+
+Status ValidateMultiDeviceOptions(
+    const FunctionDef& fdef,
+    const FunctionLibraryRuntime::InstantiateOptions& options) {
+  const OpDef& signature = fdef.signature();
+  // Multi-device functions don't currently support list inputs or outputs
+  TF_RETURN_IF_ERROR(ValidateNoListArguments(signature.input_arg(), "input",
+                                             signature.name()));
+  TF_RETURN_IF_ERROR(ValidateNoListArguments(signature.output_arg(), "output",
+                                             signature.name()));
+
+  if (fdef.attr().count(FunctionLibraryDefinition::kIntsOnDeviceAttr) != 0 &&
+      fdef.attr().at(FunctionLibraryDefinition::kIntsOnDeviceAttr).b()) {
+    return errors::Unimplemented(
+        "Function '", signature.name(), "' has `",
+        FunctionLibraryDefinition::kIntsOnDeviceAttr,
+        "` attribute set. This attribute is not currently supported by "
+        "multi-device functions.");
+  }
+
+  if (options.input_devices.size() != signature.input_arg_size()) {
+    return errors::InvalidArgument(
+        "InstantiateOptions.input_devices must have the same length "
+        "as the number of arguments: input_devices length = ",
+        options.input_devices.size(),
+        " number of arguments = ", signature.input_arg_size());
+  }
+  if (!options.output_devices.empty() &&
+      options.output_devices.size() != signature.output_arg_size()) {
+    return errors::InvalidArgument(
+        "InstantiateOptions.output_devices must either be empty or have "
+        "the same length as the number of arguments: output_devices length "
+        "= ",
+        options.output_devices.size(),
+        " number of arguments = ", signature.output_arg_size());
+  }
+
+  if (!options.state_handle.empty()) {
+    return errors::Unimplemented(
+        "InstantiateOptions.state_handle is not supported for multi-device "
+        "functions. Function: ",
+        signature.name());
+  }
+  if (options.create_kernels_eagerly) {
+    return errors::Unimplemented(
+        "InstantiateOptions.create_kernels_eagerly is not supported for "
+        "multi-device functions. Function: ",
+        signature.name());
+  }
+
+  return Status::OK();
+}
+
+Status GetGraphAndRets(const string& function_name, AttrSlice attrs,
+                       const FunctionDef* fdef,
+                       const FunctionLibraryDefinition* lib_def,
+                       std::unique_ptr<Graph>* graph,
+                       std::vector<string>* ret_node_names) {
+  auto get_func_sig = [lib_def](const string& op, const OpDef** sig) {
+    return lib_def->LookUpOpDef(op, sig);
+  };
+  FunctionBody* tmp_fbody;
+  // TODO(iga): FunctionDefToBodyHelper copies fdef. Avoid this copy.
+  TF_RETURN_IF_ERROR(
+      FunctionDefToBodyHelper(*fdef, attrs, lib_def, get_func_sig, &tmp_fbody));
+  if (tmp_fbody == nullptr) {
+    LOG(ERROR) << "Failed to get FunctionBody for \"" << function_name << "\"";
+    return errors::Internal("Failed to construct FunctionBody for ",
+                            function_name);
+  }
+  std::unique_ptr<FunctionBody> fbody(tmp_fbody);
+  *graph = std::unique_ptr<Graph>(fbody->graph);
+  fbody->graph = nullptr;
+  ret_node_names->reserve(fbody->ret_nodes.size());
+  for (const Node* node : fbody->ret_nodes) {
+    ret_node_names->push_back(node->name());
+  }
+  return Status::OK();
+}
+
+}  // anonymous namespace
+
+Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice(
+    const string& function_name, AttrSlice attrs,
+    const FunctionLibraryRuntime::InstantiateOptions& options,
+    FunctionLibraryRuntime::Handle* handle) {
+  // Check if this function has already been instantiated.
+  const string& function_key = Canonicalize(function_name, attrs, options);
+
+  {
+    mutex_lock l(mu_);
+    const auto& it = table_.find(function_key);
+    if (it != table_.end()) {
+      *handle = it->second;
+      ++mdevice_data_[*handle]->instantiation_counter_;
+      return Status::OK();
+    }
+  }
+
+  VLOG(1) << "Instantiating MultiDevice function \"" << function_name
+          << "\" on default device " << options.target;
+
+  const FunctionLibraryDefinition* lib_def =
+      options.overlay_lib == nullptr ? lib_def_ : options.overlay_lib;
+
+  const FunctionDef* fdef = lib_def->Find(function_name);
+  if (fdef == nullptr) {
+    return errors::InvalidArgument("Failed to find function \"", function_name,
+                                   "\" in function library: ", lib_def);
+  }
+
+  TF_RETURN_IF_ERROR(ValidateMultiDeviceOptions(*fdef, options));
+
+  std::unique_ptr<Graph> graph;
+  std::vector<string> ret_node_names;
+
+  TF_RETURN_IF_ERROR(GetGraphAndRets(function_name, attrs, fdef, lib_def,
+                                     &graph, &ret_node_names));
+
+  DeviceSet device_set;
+  for (auto d : device_mgr_->ListDevices()) {
+    device_set.AddDevice(d);
+  }
+
+  TF_RETURN_IF_ERROR(PinArgsAndRets(
+      options.input_devices, options.output_devices, device_set, graph.get()));
+
+  // Make the FunctionLibraryRuntime's device the default device if
+  // nothing else is hard coded. This allows the same function definition
+  // to be specialized to different devices depending on the
+  // PartitionedCallOp's device.
+  FunctionLibraryRuntime* flr = GetFLR(options.target);
+  if (flr == nullptr) {
+    return errors::InvalidArgument(
+        "Cannot instantiate multi-device function with target device ",
+        options.target);
+  }
+
+  std::unique_ptr<MultiDeviceFunctionData> data =
+      MakeUnique<MultiDeviceFunctionData>(function_name, function_key,
+                                          ret_node_names.size(),
+                                          lib_def->ReachableDefinitions(*fdef));
+
+  GraphOptimizationPassOptions optimization_options;
+  // TODO(iga): Thread other relevant options from SessionOptions.
+  SessionOptions session_options;
+  session_options.env = flr->env();
+  optimization_options.session_options = &session_options;
+  optimization_options.graph = &graph;
+  optimization_options.flib_def = &data->overlay_lib_;
+  optimization_options.device_set = &device_set;
+
+  DumpGraph("Before running PRE_PLACEMENT passes", graph.get());
+  TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
+      OptimizationPassRegistry::PRE_PLACEMENT, optimization_options));
+
+  DumpGraph("Before calling Placer", graph.get());
+  Placer placer(graph.get(), &device_set, nullptr, /* No session options */
+                flr->device() /* Default device */);
+  TF_RETURN_IF_ERROR(placer.Run());
+
+  DumpGraph("Before running POST_PLACEMENT passes", graph.get());
+  TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
+      OptimizationPassRegistry::POST_PLACEMENT, optimization_options));
+  DumpGraph("Before running POST_REWRITE_FOR_EXEC passes", graph.get());
+  TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
+      OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, optimization_options));
+  DumpGraph("After all optimization passes", graph.get());
+
+  Device* cpu_device;
+  TF_RETURN_IF_ERROR(device_mgr_->LookupDevice("CPU:0", &cpu_device));
+
+  if (options.optimize_graph_fn) {
+    Status status = options.optimize_graph_fn(std::move(ret_node_names),
+                                              &data->overlay_lib_, device_set,
+                                              cpu_device, &graph);
+    if (!status.ok()) {
+      LOG(WARNING) << "Ignoring multi-device function optimization failure: "
+                   << status.ToString();
+    }
+    DumpGraph("After optimization", graph.get());
+  }
+
+  std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
+  TF_RETURN_IF_ERROR(
+      PartitionFunctionGraph(device_set, std::move(graph), &subgraphs));
+
+  if (options.graph_collector != nullptr) {
+    for (const auto& pair : subgraphs) {
+      GraphDef def;
+      pair.second->ToGraphDef(&def);
+      options.graph_collector->CollectGraph(def);
+    }
+  }
+
+  int i = 0;
+  FunctionNameGenerator name_generator(&data->overlay_lib_, function_name);
+  for (const auto& pair : subgraphs) {
+    i += 1;
+    // TODO(iga): Fail gracefully if the set of devices corresponds
+    // to more than one address space.
+    const string& target = pair.first;
+    Graph* subgraph = pair.second.get();
+
+    ComponentFunctionData* comp_data = &data->glue_[target];
+    TF_RETURN_IF_ERROR(UpdateArgAndRetvalMetadata(
+        subgraph, &comp_data->arg_indices_, &comp_data->ret_indices_,
+        &comp_data->arg_alloc_attrs_, &comp_data->ret_alloc_attrs_));
+    FunctionDef shard;
+    string unique_name = name_generator.GetName();
+    TF_RETURN_IF_ERROR(GraphToFunctionDef(*subgraph, unique_name, &shard));
+    FunctionLibraryRuntime* target_flr = GetFLR(target);
+    TF_RETURN_IF_ERROR(data->overlay_lib_.AddFunctionDef(shard));
+    FunctionLibraryRuntime::InstantiateOptions opts;
+    opts.executor_type = options.executor_type;
+    opts.target = target;
+    opts.overlay_lib = &data->overlay_lib_;
+    FunctionLibraryRuntime::Handle component_handle;
+
+    TF_RETURN_IF_ERROR(target_flr->Instantiate(
+        unique_name, AttrSlice(&shard.attr()), opts, &component_handle));
+    VLOG(1) << "Instantiated component function " << unique_name
+            << " on device " << target << " with component handle "
+            << component_handle;
+    VLOG(2) << DebugString(shard);
+    comp_data->handle_ = component_handle;
+  }
+
+  *handle = AddMultiDeviceHandle(std::move(data), function_key);
+  VLOG(2) << "Instantiated MultiDevice function \"" << function_name
+          << "\" with handle " << *handle;
+  return Status::OK();
+}
+
+Status ProcessFunctionLibraryRuntime::GetOutputDevices(
+    FunctionLibraryRuntime::Handle handle,
+    std::vector<Device*>* output_devices) const {
+  const MultiDeviceFunctionData* data = IsMultiDevice(handle);
+  if (data == nullptr) {
+    return errors::InvalidArgument(
+        "Failed for find multi-device function handle ", handle);
+  }
+
+  for (const auto& pair : data->glue_) {
+    const ComponentFunctionData& comp_data = pair.second;
+    DCHECK(comp_data.ret_alloc_attrs_.size() == comp_data.ret_indices_.size());
+
+    const string& target = pair.first;
+    FunctionLibraryRuntime* target_flr = GetFLR(target);
+    Device* target_device = target_flr->device();
+    const FunctionBody* fbody = target_flr->GetFunctionBody(comp_data.handle_);
+    DCHECK(fbody != nullptr);
+
+    output_devices->resize(data->num_outputs_);
+    for (int j = 0; j < comp_data.ret_indices_.size(); ++j) {
+      int ret_index = comp_data.ret_indices_[j];
+      if (fbody->ret_types[j] == DT_RESOURCE) {
+        (*output_devices)[ret_index] = target_device;
+      } else {
+        (*output_devices)[ret_index] =
+            comp_data.ret_alloc_attrs_[j].on_host() ? nullptr : target_device;
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+void ProcessFunctionLibraryRuntime::RunMultiDevice(
+    const FunctionLibraryRuntime::Options& opts,
+    FunctionLibraryRuntime::Handle handle, gtl::ArraySlice<Tensor> args,
+    std::vector<Tensor>* rets,
+    FunctionLibraryRuntime::DoneCallback done) const {
+  if (opts.create_rendezvous) {
+    // FLR->Run() is the default entry point. It checks for cancellation,
+    // creates rendezvous, etc.
+    // Letting create_rendezvous through will do the wrong thing - each
+    // component function will get a separate rendezvous created by its FLR.
+    done(
+        errors::Internal("Cannot call ProcessFunctionLibraryRuntime::Run with "
+                         "create_rendezvous=true. Please run the function "
+                         "using FunctionLibraryRuntime::Run"));
+    return;
+  }
+
+  const MultiDeviceFunctionData* data = IsMultiDevice(handle);
+  if (data == nullptr) {
+    done(
+        errors::InvalidArgument("Failed for find multi-device function handle ",
+                                handle, ". Was the function instantiated?"));
+    return;
+  }
+
+  if (data->glue_.empty()) {
+    // Trivial case where the function body is empty.
+    done(Status::OK());
+    return;
+  }
+
+  auto* refcounted_done = new ReffedStatusCallback(std::move(done));
+  for (int i = 0; i < data->glue_.size(); ++i) {
+    refcounted_done->Ref();
+  }
+
+  FunctionLibraryRuntime::Options opts_copy = opts;
+  for (const auto& pair : data->glue_) {
+    const string& target = pair.first;
+    const ComponentFunctionData& comp_data = pair.second;
+    FunctionLibraryRuntime::Handle handle = pair.second.handle_;
+    VLOG(1) << "Running function shard on device " << target << " with handle "
+            << handle;
+
+    opts_copy.args_alloc_attrs = comp_data.arg_alloc_attrs_;
+    opts_copy.rets_alloc_attrs = comp_data.ret_alloc_attrs_;
+    opts_copy.remote_execution = false;
+    std::vector<Tensor> comp_args =
+        GetArgsForIndices(comp_data.arg_indices_, args);
+    std::vector<Tensor>* comp_rets = new std::vector<Tensor>;
+    rets->resize(data->num_outputs_);
+    GetFLR(target)->Run(
+        opts_copy, handle, comp_args, comp_rets,
+        [comp_rets, rets, comp_data, refcounted_done](const Status& status) {
+          if (!status.ok()) {
+            LOG(ERROR) << "Component function execution failed: " << status;
+            refcounted_done->UpdateStatus(status);
+          } else {
+            for (int i = 0; i < comp_rets->size(); ++i) {
+              (*rets)[comp_data.ret_indices_[i]] = (*comp_rets)[i];
+            }
+          }
+          delete comp_rets;
+          // refcounted_done is thread-safe
+          refcounted_done->Unref();
+        });
+  }
+  refcounted_done->Unref();
+}
+
 Status ProcessFunctionLibraryRuntime::Instantiate(
     const string& function_name, AttrSlice attrs,
     const FunctionLibraryRuntime::InstantiateOptions& options,
     FunctionLibraryRuntime::Handle* handle) {
+  if (options.is_multi_device_function) {
+    return InstantiateMultiDevice(function_name, attrs, options, handle);
+  }
+
   *handle = kInvalidHandle;
   FunctionLibraryRuntime* flr = GetFLR(options.target);
   if (flr != nullptr) {
@@ -247,11 +769,7 @@ Status ProcessFunctionLibraryRuntime::Instantiate(
     FunctionLibraryRuntime::Handle h =
         gtl::FindWithDefault(table_, function_key, kInvalidHandle);
     if (h == kInvalidHandle || function_data_.count(h) == 0) {
-      h = next_handle_;
-      function_data_[h] = MakeUnique<FunctionData>(
-          options.target, kInvalidHandle, function_key);
-      table_[function_key] = h;
-      next_handle_++;
+      h = AddHandleLocked(function_key, options.target, kInvalidHandle);
     }
     f = function_data_[h].get();
     *handle = h;
@@ -272,8 +790,48 @@ Status ProcessFunctionLibraryRuntime::RemoveHandle(
   return Status::OK();
 }
 
+Status ProcessFunctionLibraryRuntime::ReleaseMultiDeviceHandle(
+    FunctionLibraryRuntime::Handle handle) {
+  std::unique_ptr<MultiDeviceFunctionData> mdata;
+  {
+    mutex_lock l(mu_);
+    auto it = mdevice_data_.find(handle);
+    --it->second->instantiation_counter_;
+    if (it->second->instantiation_counter_ != 0) {
+      return Status::OK();
+    }
+    mdata = std::move(it->second);
+    table_.erase(mdata->function_key_);
+    mdevice_data_.erase(it);
+  }
+
+  // If we are here we are releasing the last instantiation of `handle`.
+  // Release all component function handles.
+  Status overall_status;
+  for (const auto& it : mdata->glue_) {
+    const string& device = it.first;
+    FunctionLibraryRuntime::Handle flr_handle = it.second.handle_;
+    FunctionLibraryRuntime* flr = GetFLR(device);
+    if (flr == nullptr) {
+      return errors::InvalidArgument(
+          "Failed to find FunctionLibraryRuntime for device ", device,
+          " when releasing multi-device function handle ", handle);
+    }
+    Status status = flr->ReleaseHandle(flr_handle);
+    if (!status.ok()) {
+      overall_status = status;
+    }
+  }
+
+  return overall_status;
+}
+
 Status ProcessFunctionLibraryRuntime::ReleaseHandle(
     FunctionLibraryRuntime::Handle handle) {
+  if (IsMultiDevice(handle)) {
+    return ReleaseMultiDeviceHandle(handle);
+  }
+
   FunctionLibraryRuntime* flr = nullptr;
   string target_device;
   {
@@ -291,12 +849,15 @@ Status ProcessFunctionLibraryRuntime::ReleaseHandle(
 void ProcessFunctionLibraryRuntime::Run(
     const FunctionLibraryRuntime::Options& opts,
     FunctionLibraryRuntime::Handle handle, gtl::ArraySlice<Tensor> args,
-    std::vector<Tensor>* rets, FunctionLibraryRuntime::DoneCallback done) {
-  if (!opts.remote_execution) {
-    done(errors::InvalidArgument(
-        "ProcessFunctionLibraryRuntime::Run should only be called when there ",
-        "is a remote execution."));
-    return;
+    std::vector<Tensor>* rets,
+    FunctionLibraryRuntime::DoneCallback done) const {
+  bool multi_device;
+  {
+    tf_shared_lock l(mu_);
+    multi_device = mdevice_data_.find(handle) != mdevice_data_.end();
+  }
+  if (multi_device) {
+    return RunMultiDevice(opts, handle, args, rets, done);
   }
 
   FunctionLibraryRuntime* flr = nullptr;
@@ -313,6 +874,15 @@ void ProcessFunctionLibraryRuntime::Run(
     target_device = function_data->target_device();
     local_handle = function_data->local_handle();
   }
+
+  if (!opts.remote_execution) {
+    done(
+        errors::InvalidArgument("ProcessFunctionLibraryRuntime::Run should "
+                                "only be called for multi-device functions or "
+                                "for remote execution."));
+    return;
+  }
+
   flr = GetFLR(target_device);
   if (flr != nullptr) {
     auto rendezvous = opts.rendezvous;
@@ -374,7 +944,7 @@ Status ProcessFunctionLibraryRuntime::Clone(
     Env* env, int graph_def_version, const OptimizerOptions& optimizer_options,
     CustomKernelCreator custom_kernel_creator,
     std::unique_ptr<FunctionLibraryDefinition>* out_lib_def,
-    std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr) {
+    std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr) const {
   out_lib_def->reset(new FunctionLibraryDefinition(*lib_def_));
   out_pflr->reset(new ProcessFunctionLibraryRuntime(
       device_mgr_, env, graph_def_version, out_lib_def->get(),
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h
index 53815715d8b9d033f5600320108cb443c36b3e93..a08e84510737190c628775f6a8002a1190056207 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.h
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.h
@@ -18,6 +18,7 @@ limitations under the License.
 #include <unordered_map>
 
 #include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/protobuf/config.pb.h"
@@ -79,7 +80,8 @@ class ProcessFunctionLibraryRuntime {
   FunctionLibraryRuntime* GetFLR(const string& device_name) const;
 
   // Returns the device incarnation for the given device_name.
-  Status GetDeviceIncarnation(const string& device_name, int64* incarnation);
+  Status GetDeviceIncarnation(const string& device_name,
+                              int64* incarnation) const;
 
   // For a given canonicalized key signature of the function instantiated
   // on device `device_name` and a `local_handle`, creates a handle and returns
@@ -94,14 +96,23 @@ class ProcessFunctionLibraryRuntime {
 
   // For the given handle instantiated on device `device_name` returns the local
   // index of instantiation of that function. If the function was not
-  // instantiated on `device_name` returns kInvalidLocalHandle.
+  // instantiated on `device_name` or the function is multi-device,
+  // returns kInvalidLocalHandle.
   FunctionLibraryRuntime::LocalHandle GetHandleOnDevice(
-      const string& device_name, FunctionLibraryRuntime::Handle handle);
+      const string& device_name, FunctionLibraryRuntime::Handle handle) const;
+
+  // Fills `output_devices` with the devices on which the results will
+  // be produced. If some output is produced on CPU, the corresponding Device*
+  // is set to nullptr. If some output is DT_RESOURCE, the corresponding Device*
+  // is set to the device backing the resource.
+  // REQUIRES: `handle` identifies a multi-device function.
+  Status GetOutputDevices(FunctionLibraryRuntime::Handle handle,
+                          std::vector<Device*>* output_devices) const;
 
   // Returns true if function with handle `handle` was instantiated on device
-  // `device_name`.
+  // `device_name`. Returns false for multi-device functions.
   bool IsInstantiatedOnDevice(const string& device_name,
-                              FunctionLibraryRuntime::Handle handle);
+                              FunctionLibraryRuntime::Handle handle) const;
 
   // Instantiates the function. See framework/function.h for more details.
   // Allows for function_name to be instantiated on different devices
@@ -114,6 +125,9 @@ class ProcessFunctionLibraryRuntime {
   // tells it to release it. If the `handle` isnt' needed at all, the local FLR
   // might call RemoveHandle on this to get rid of the state owned by the Proc
   // FLR.
+  // For multi-device functions, calls ReleaseHandle on local FLRs for each
+  // component function that is part of this multi-device function.
+  // Each local FLR might call RemoveHandle on this.
   Status ReleaseHandle(FunctionLibraryRuntime::Handle handle);
 
   // Runs the function with given `handle`. Function could have been
@@ -121,17 +135,78 @@ class ProcessFunctionLibraryRuntime {
   void Run(const FunctionLibraryRuntime::Options& opts,
            FunctionLibraryRuntime::Handle handle, gtl::ArraySlice<Tensor> args,
            std::vector<Tensor>* rets,
-           FunctionLibraryRuntime::DoneCallback done);
+           FunctionLibraryRuntime::DoneCallback done) const;
 
  private:
+  friend class FunctionLibraryRuntimeImpl;
+
+  using DeviceAndFHandle = std::pair<string, FunctionLibraryRuntime::Handle>;
+  using ArgAndRetIndices = std::pair<std::vector<int>, std::vector<int>>;
+  using ArgAndRetAllocAttrs = std::pair<std::vector<AllocatorAttributes>,
+                                        std::vector<AllocatorAttributes>>;
+
+  FunctionLibraryRuntime::Handle AddHandleLocked(
+      const string& function_key, const string& device_name,
+      FunctionLibraryRuntime::LocalHandle local_handle)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // Structure to keep track of how a component function (a single-device
+  // piece of a multi-device function) fits into the multi-device function.
+  struct ComponentFunctionData {
+    // The handle for the instantiated component function.
+    FunctionLibraryRuntime::Handle handle_;
+    // arg_indices_.size() is the number of arguments to the component function.
+    // The i'th argument of the component function comes from the
+    // `arg_indices_[i]`th argument of the multi-device function.
+    std::vector<int> arg_indices_;
+    // ret_indices_.size() is the number of return value of the component
+    // function.  The i'th return value of the component function goes to the
+    // `ret_indices_[i]`th return value of the multi-device function.
+    std::vector<int> ret_indices_;
+    // arg_alloc_attrs_[i] are the allocator attributes of the i'th argument to
+    // the component function.
+    std::vector<AllocatorAttributes> arg_alloc_attrs_;
+    // ret_alloc_attrs_[i] are the allocator attributes of the i'th return value
+    // of the component function.
+    std::vector<AllocatorAttributes> ret_alloc_attrs_;
+  };
+
+  // Data structure holding information for a single instantiated multi-device
+  // function.
+  // The fields are filled in during instantiation. Once the object is
+  // added to mdevice_data_, all fields are constant.
+  struct MultiDeviceFunctionData {
+    MultiDeviceFunctionData(const string& function_name,
+                            const string& function_key, int num_outputs,
+                            const FunctionLibraryDefinition& overlay_lib)
+        : num_outputs_(num_outputs),
+          instantiation_counter_(1),
+          function_name_(function_name),
+          function_key_(function_key),
+          overlay_lib_(overlay_lib) {}
+
+    // Stored here to resize the output tensor vector when function is run.
+    const int num_outputs_;
+    uint64 instantiation_counter_;
+    const string function_name_;
+    const string function_key_;
+    // The overlay library holding component function definitions as well as
+    // the definitions of functions they call.
+    FunctionLibraryDefinition overlay_lib_;
+
+    // Maps the device name to the information about the component function
+    // be run on this device.
+    std::unordered_map<string, ComponentFunctionData> glue_;
+  };
+
   // For a given device_name, returns a DeviceContext for copying
   // tensors to/from the device.
   Status GetDeviceContext(const string& device_name,
-                          DeviceContext** device_context);
+                          DeviceContext** device_context) const;
 
   // Looks up the information for the given `handle` and returns the name
   // of the device where the function is registered.
-  string GetDeviceName(FunctionLibraryRuntime::Handle handle);
+  string GetDeviceName(FunctionLibraryRuntime::Handle handle) const;
 
   // Removes handle from the state owned by this object.
   Status RemoveHandle(FunctionLibraryRuntime::Handle handle);
@@ -140,12 +215,39 @@ class ProcessFunctionLibraryRuntime {
                const OptimizerOptions& optimizer_options,
                CustomKernelCreator custom_kernel_creator,
                std::unique_ptr<FunctionLibraryDefinition>* out_lib_def,
-               std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr);
-
-  friend class FunctionLibraryRuntimeImpl;
-
-  mutable mutex mu_;
-
+               std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr) const;
+
+  Status ReleaseMultiDeviceHandle(FunctionLibraryRuntime::Handle handle);
+
+  // If handle represents a multi-device function, returns the multi-device
+  // data associated with handle. Else, nullptr.
+  MultiDeviceFunctionData* IsMultiDevice(
+      FunctionLibraryRuntime::Handle handle) const;
+
+  Status InstantiateMultiDevice(
+      const string& function_name, AttrSlice attrs,
+      const FunctionLibraryRuntime::InstantiateOptions& options,
+      FunctionLibraryRuntime::Handle* handle);
+
+  FunctionLibraryRuntime::Handle AddMultiDeviceHandle(
+      const std::unique_ptr<MultiDeviceFunctionData> data,
+      const string& function_key);
+
+  // TODO(iga): Reword
+  // Pins each arg that emits a `DT_RESOURCE` tensor to the device on which the
+  // corresponding resource lives. This ensures that the Placer assigns ops that
+  // access these resources to the appropriate devices.
+  Status PinArgsAndRets(const std::vector<string>& input_devices,
+                        const std::vector<string>& output_devices,
+                        const DeviceSet& device_set, Graph* graph) const;
+
+  void RunMultiDevice(const FunctionLibraryRuntime::Options& opts,
+                      FunctionLibraryRuntime::Handle handle,
+                      gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
+                      FunctionLibraryRuntime::DoneCallback done) const;
+
+  // Data structure holding information for a single instantiated remote
+  // (to be executed on `target_device`) function.
   class FunctionData {
    public:
     FunctionData(const string& target_device,
@@ -181,15 +283,26 @@ class ProcessFunctionLibraryRuntime {
     Notification init_done_;
   };
 
+  mutable mutex mu_;
+
   const DeviceMgr* const device_mgr_;
   const FunctionLibraryDefinition* lib_def_;
   thread::ThreadPool* default_thread_pool_;
-  // Holds all the function invocations here.
+
+  // Holds all the function instantiations. Maps function_keys to handles.
   std::unordered_map<string, FunctionLibraryRuntime::Handle> table_
       GUARDED_BY(mu_);
+
+  // Function data for instantitated remote functions.
   std::unordered_map<FunctionLibraryRuntime::Handle,
                      std::unique_ptr<FunctionData>>
       function_data_ GUARDED_BY(mu_);
+
+  // Function data for instantiated multi-device functions.
+  std::unordered_map<FunctionLibraryRuntime::Handle,
+                     std::unique_ptr<MultiDeviceFunctionData>>
+      mdevice_data_ GUARDED_BY(mu_);
+
   std::unordered_map<Device*, std::unique_ptr<FunctionLibraryRuntime>> flr_map_;
   int next_handle_ GUARDED_BY(mu_);
   DistributedFunctionLibraryRuntime* const parent_;
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
index 21cb62118aebafa8a03903296b65f0617510f080..b4d3ac0df304e7caf0b742d018d43c9def2d76e6 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
@@ -21,7 +21,9 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/resource_var.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/type_index.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -29,6 +31,11 @@ limitations under the License.
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
 
+#ifdef GOOGLE_CUDA
+#include "cuda/include/cuda.h"
+#include "cuda/include/cuda_runtime_api.h"
+#endif  // GOOGLE_CUDA
+
 namespace tensorflow {
 namespace {
 
@@ -65,9 +72,18 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
     std::vector<std::unique_ptr<Device>> devices;
     TF_CHECK_OK(DeviceFactory::AddDevices(options, "/job:a/replica:0/task:0",
                                           &devices));
-    device0_ = devices[0].get();
-    device1_ = devices[1].get();
     device_mgr_.reset(new DeviceMgr(std::move(devices)));
+    TF_CHECK_OK(device_mgr_->LookupDevice(
+        "/job:a/replica:0/task:0/device:CPU:0", &device0_));
+    TF_CHECK_OK(device_mgr_->LookupDevice(
+        "/job:a/replica:0/task:0/device:CPU:1", &device1_));
+    // If no GPU is available, gpu_device_ will remain nullptr.
+    Status status = device_mgr_->LookupDevice(
+        "/job:a/replica:0/task:0/device:GPU:0", &gpu_device_);
+    if (!status.ok()) {
+      CHECK_EQ(nullptr, gpu_device_);
+    }
+
     FunctionDefLibrary proto;
     for (const auto& fdef : flib) *(proto.add_function()) = fdef;
     lib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), proto));
@@ -86,6 +102,55 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
     return proc_flr_->Instantiate(name, attrs, instantiate_opts, handle);
   }
 
+  Tensor GPUToCPU(const Tensor& device_tensor) {
+#ifdef GOOGLE_CUDA
+    CHECK(gpu_device_);
+    CHECK(gpu_device_->tensorflow_gpu_device_info() != nullptr);
+    DeviceContext* device_context =
+        gpu_device_->tensorflow_gpu_device_info()->default_context;
+
+    Notification n;
+    Status status;
+    Tensor cpu_tensor(device_tensor.dtype(), device_tensor.shape());
+    device_context->CopyDeviceTensorToCPU(&device_tensor, "", gpu_device_,
+                                          &cpu_tensor,
+                                          [&n, &status](const Status& s) {
+                                            status = s;
+                                            n.Notify();
+                                          });
+    n.WaitForNotification();
+    CHECK(status.ok());
+    return cpu_tensor;
+#else
+    CHECK(false);
+#endif  // GOOGLE_CUDA
+  }
+
+  Tensor CPUToGPU(const Tensor& cpu_tensor) {
+#ifdef GOOGLE_CUDA
+    CHECK(gpu_device_);
+    CHECK(gpu_device_->tensorflow_gpu_device_info() != nullptr);
+    DeviceContext* device_context =
+        gpu_device_->tensorflow_gpu_device_info()->default_context;
+
+    Notification n;
+    Status status;
+    Tensor device_tensor(gpu_device_->GetAllocator({}), cpu_tensor.dtype(),
+                         cpu_tensor.shape(), {});
+    device_context->CopyCPUTensorToDevice(&cpu_tensor, gpu_device_,
+                                          &device_tensor,
+                                          [&n, &status](const Status& s) {
+                                            status = s;
+                                            n.Notify();
+                                          });
+    n.WaitForNotification();
+    CHECK(status.ok());
+    return device_tensor;
+#else
+    CHECK(false);
+#endif  // GOOGLE_CUDA
+  }
+
   Status Run(const string& name, FunctionLibraryRuntime::Options opts,
              test::function::Attrs attrs,
              const FunctionLibraryRuntime::InstantiateOptions& instantiate_opts,
@@ -135,7 +200,7 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
                      done2.Notify();
                    });
     done2.WaitForNotification();
-    EXPECT_TRUE(errors::IsNotFound(status));
+    EXPECT_TRUE(errors::IsNotFound(status)) << "Actual status: " << status;
     EXPECT_TRUE(str_util::StrContains(status.error_message(), "not found."));
 
     return Status::OK();
@@ -144,6 +209,8 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
   std::unique_ptr<DeviceMgr> device_mgr_;
   Device* device0_ = nullptr;  // Not owned. (Owned by device_mgr_.)
   Device* device1_ = nullptr;  // Not owned. (Owned by device_mgr_.)
+  // Remains as nullptr if no GPU is available.
+  Device* gpu_device_ = nullptr;  // Not owned. (Owned by device_mgr_.)
   std::unique_ptr<FunctionLibraryDefinition> lib_def_;
   std::unique_ptr<TestClusterFLR> cluster_flr_;
   std::unique_ptr<ProcessFunctionLibraryRuntime> proc_flr_;
@@ -345,5 +412,300 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, ClusterFLRParallelTest) {
   rendezvous_->Unref();
 }
 
+bool IsCUDATensor(const Tensor& t) {
+#ifdef GOOGLE_CUDA
+  cudaPointerAttributes attributes;
+  cudaError_t err =
+      cudaPointerGetAttributes(&attributes, t.tensor_data().data());
+  if (err == cudaErrorInvalidValue) return false;
+  CHECK_EQ(cudaSuccess, err) << cudaGetErrorString(err);
+  return (attributes.memoryType == cudaMemoryTypeDevice);
+#else
+  CHECK(false)
+      << "IsCUDATensor should not be called when CUDA is not available";
+#endif  // GOOGLE_CUDA
+}
+
+void TestTwoDeviceMult(
+    ProcessFunctionLibraryRuntimeTest* fixture,
+    const FunctionLibraryRuntime::InstantiateOptions& inst_opts,
+    const string& error = "") {
+  fixture->Init({test::function::TwoDeviceMult()});
+  FunctionLibraryRuntime::Options opts;
+  auto x = test::AsTensor<float>({1, 2, 3});
+  Tensor y_cpu;
+  Tensor y_gpu;
+  Status status = fixture->Run("TwoDeviceMult", opts, {{"T", DT_FLOAT}},
+                               inst_opts, {x}, {&y_cpu, &y_gpu});
+  if (!error.empty()) {
+    EXPECT_TRUE(errors::IsInvalidArgument(status))
+        << "Actual status: " << status;
+    EXPECT_TRUE(str_util::StrContains(status.error_message(), error))
+        << "Actual error message: " << status.error_message();
+    fixture->rendezvous_->Unref();
+    return;
+  }
+
+  EXPECT_TRUE(status.ok()) << "Actual status: " << status;
+  EXPECT_FALSE(IsCUDATensor(y_cpu));
+  test::ExpectTensorEqual<float>(y_cpu, test::AsTensor<float>({2, 4, 6}));
+
+  EXPECT_TRUE(IsCUDATensor(y_gpu));
+  Tensor y_gpu_on_cpu = fixture->GPUToCPU(y_gpu);
+  test::ExpectTensorEqual<float>(y_gpu_on_cpu,
+                                 test::AsTensor<float>({3, 6, 9}));
+  fixture->rendezvous_->Unref();
+}
+
+void TestTwoDeviceInputOutput(
+    ProcessFunctionLibraryRuntimeTest* fixture,
+    const FunctionLibraryRuntime::InstantiateOptions& inst_opts) {
+  if (fixture->gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  fixture->Init({test::function::TwoDeviceInputOutput()});
+  FunctionLibraryRuntime::Options opts;
+  Tensor x1 = test::AsTensor<float>({1, 2});
+  if (str_util::StrContains(inst_opts.input_devices[0], "GPU")) {
+    x1 = fixture->CPUToGPU(x1);
+  }
+  Tensor x2 = test::AsTensor<float>({10, 20});
+  if (str_util::StrContains(inst_opts.input_devices[1], "GPU")) {
+    x2 = fixture->CPUToGPU(x2);
+  }
+
+  Tensor y1;
+  Tensor y2;
+  TF_CHECK_OK(fixture->Run("TwoDeviceInputOutput", opts, {{"T", DT_FLOAT}},
+                           inst_opts, {x1, x2}, {&y1, &y2}));
+
+  if (str_util::StrContains(inst_opts.output_devices[0], "GPU")) {
+    EXPECT_TRUE(IsCUDATensor(y1));
+    y1 = fixture->GPUToCPU(y1);
+  } else {
+    EXPECT_FALSE(IsCUDATensor(y1));
+  }
+  test::ExpectTensorEqual<float>(y1, test::AsTensor<float>({2, 4}));
+
+  if (str_util::StrContains(inst_opts.output_devices[1], "GPU")) {
+    EXPECT_TRUE(IsCUDATensor(y2));
+    y2 = fixture->GPUToCPU(y2);
+  } else {
+    EXPECT_FALSE(IsCUDATensor(y2));
+  }
+  test::ExpectTensorEqual<float>(y2, test::AsTensor<float>({30, 60}));
+
+  fixture->rendezvous_->Unref();
+}
+
+std::vector<string> CompleteDevices(const std::vector<string>& v) {
+  std::vector<string> result;
+  result.reserve(v.size());
+  for (const string& s : v) {
+    result.push_back(strings::StrCat("/job:a/replica:0/task:0/device:", s));
+  }
+  return result;
+}
+
+FunctionLibraryRuntime::InstantiateOptions MakeOptions(
+    const string& target, const std::vector<string>& input_devices,
+    const std::vector<string>& output_devices) {
+  FunctionLibraryRuntime::InstantiateOptions inst_opts;
+  inst_opts.target = target;
+  inst_opts.input_devices = CompleteDevices(input_devices);
+  inst_opts.output_devices = CompleteDevices(output_devices);
+  inst_opts.is_multi_device_function = true;
+  return inst_opts;
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_ExplicitOutputDevice) {
+  if (gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  TestTwoDeviceMult(this, MakeOptions("CPU:0", {"CPU:0"}, {"CPU:0", "GPU:0"}));
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_InferredOutputDevice) {
+  if (gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  TestTwoDeviceMult(this, MakeOptions("CPU:0", {"CPU:0"}, {}));
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_ErrorWhenNoInputDevices) {
+  if (gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  TestTwoDeviceMult(this, MakeOptions("CPU:0", {}, {}),
+                    "input_devices must have the same length");
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest,
+       MultiDevice_ErrorWhenTooManyInputDevices) {
+  if (gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  TestTwoDeviceMult(this, MakeOptions("CPU:0", {"CPU:0", "CPU:1"}, {}),
+                    "input_devices must have the same length");
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest,
+       MultiDevice_ErrorWhenTooManyOutputDevices) {
+  TestTwoDeviceMult(
+      this, MakeOptions("CPU:0", {"CPU:0"}, {"CPU:0", "GPU:0", "CPU:1"}),
+      "output_devices must either be empty or have the same length");
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest,
+       MultiDevice_ErrorWhenBadTargetDevice) {
+  TestTwoDeviceMult(
+      this, MakeOptions("GPU:11", {"CPU:0"}, {"CPU:0", "GPU:0"}),
+      "Cannot instantiate multi-device function with target device GPU:11");
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_ErrorWhenListInput) {
+  const FunctionDef& def = test::function::FuncWithListInput();
+  Init({def});
+  FunctionLibraryRuntime::Handle handle;
+  Status status = proc_flr_->Instantiate(
+      "FuncWithListInput", test::function::Attrs({{"T", DT_FLOAT}, {"N", 1}}),
+      MakeOptions("CPU:0", {"CPU:0"}, {}), &handle);
+  ASSERT_TRUE(errors::IsInvalidArgument(status)) << "Actual status: " << status;
+  ASSERT_TRUE(str_util::StrContains(
+      status.error_message(),
+      "FuncWithListInput has an input named \"x1\" that is a list of tensors"))
+      << "Actual error message: " << status.error_message();
+  rendezvous_->Unref();
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_ErrorWhenListOutput) {
+  const FunctionDef& def = test::function::FuncWithListOutput();
+  Init({def});
+  FunctionLibraryRuntime::Handle handle;
+  Status status = proc_flr_->Instantiate(
+      "FuncWithListOutput", test::function::Attrs({{"T", DT_FLOAT}, {"N", 1}}),
+      MakeOptions("CPU:0", {}, {"CPU:0"}), &handle);
+  ASSERT_TRUE(errors::IsInvalidArgument(status)) << "Actual status: " << status;
+  ASSERT_TRUE(str_util::StrContains(
+      status.error_message(),
+      "FuncWithListOutput has an output named \"y\" that is a list of tensors"))
+      << "Actual error message: " << status.error_message();
+  rendezvous_->Unref();
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest,
+       MultiDevice_ExplicitMultiInputOutput) {
+  TestTwoDeviceInputOutput(
+      this, MakeOptions("CPU:0", {"CPU:0", "GPU:0"}, {"CPU:0", "GPU:0"}));
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_FlipInputs) {
+  TestTwoDeviceInputOutput(
+      this, MakeOptions("CPU:0", {"GPU:0", "CPU:0"}, {"CPU:0", "GPU:0"}));
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_FlipOutputs) {
+  TestTwoDeviceInputOutput(
+      this, MakeOptions("CPU:0", {"CPU:0", "GPU:0"}, {"GPU:0", "CPU:0"}));
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_FlipBoth) {
+  TestTwoDeviceInputOutput(
+      this, MakeOptions("CPU:0", {"GPU:0", "CPU:0"}, {"GPU:0", "CPU:0"}));
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_EmptyBodySwap) {
+  if (gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  FunctionLibraryRuntime::InstantiateOptions inst_opts =
+      MakeOptions("CPU:0", {"GPU:0", "CPU:0"}, {"CPU:0", "GPU:0"});
+  Init({test::function::EmptyBodySwap()});
+
+  Tensor x1 = CPUToGPU(test::AsTensor<float>({1, 2}));
+  Tensor x2 = test::AsTensor<float>({10, 20});
+  Tensor y1;
+  Tensor y2;
+  TF_CHECK_OK(Run("EmptyBodySwap", {}, {{"T", DT_FLOAT}}, inst_opts, {x1, x2},
+                  {&y1, &y2}));
+
+  EXPECT_FALSE(IsCUDATensor(y1));
+  test::ExpectTensorEqual<float>(y1, test::AsTensor<float>({10, 20}));
+
+  EXPECT_TRUE(IsCUDATensor(y2));
+  y2 = GPUToCPU(y2);
+  test::ExpectTensorEqual<float>(y2, test::AsTensor<float>({1, 2}));
+
+  rendezvous_->Unref();
+}
+
+Tensor GetResourceHandle(const string& var_name, const string& container,
+                         const string& device_name) {
+  ResourceHandle handle;
+  handle.set_device(device_name);
+  handle.set_container(container);
+  handle.set_name(var_name);
+  handle.set_hash_code(MakeTypeIndex<Var>().hash_code());
+  handle.set_maybe_type_name(MakeTypeIndex<Var>().name());
+  Tensor tensor(DT_RESOURCE, TensorShape({}));
+  tensor.scalar<ResourceHandle>()() = handle;
+  return tensor;
+}
+
+void TestResourceOutputAndUse(ProcessFunctionLibraryRuntimeTest* fixture,
+                              const string& resource_return_device) {
+  if (fixture->gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  FunctionLibraryRuntime::InstantiateOptions inst_opts = MakeOptions(
+      "CPU:0", {"GPU:0", "GPU:0"}, {resource_return_device, "GPU:0"});
+  fixture->Init({test::function::ResourceOutput(),
+                 test::function::ReadResourceVariable()});
+
+  // Make resource var
+  Tensor resource_value = fixture->CPUToGPU(test::AsTensor<float>({10, 20}));
+  Var* resource = new Var(DT_FLOAT);
+  *resource->tensor() = resource_value;
+  resource->is_initialized = true;
+  ResourceMgr* mgr = fixture->gpu_device_->resource_manager();
+  Status status = mgr->Create(mgr->default_container(), "my_gpu_var", resource);
+  ASSERT_TRUE(status.ok()) << status.error_message();
+
+  // Run the function taking a resource and outputing it
+  Tensor x1 = fixture->CPUToGPU(test::AsTensor<float>({1, 2}));
+  Tensor x2 = GetResourceHandle("my_gpu_var", mgr->default_container(),
+                                "/job:a/replica:0/task:0/device:GPU:0");
+  Tensor returned_handle;
+  Tensor y2;
+  TF_CHECK_OK(fixture->Run("ResourceOutput", {}, {{"T", DT_FLOAT}}, inst_opts,
+                           {x1, x2}, {&returned_handle, &y2}));
+
+  EXPECT_FALSE(IsCUDATensor(returned_handle));
+  EXPECT_TRUE(IsCUDATensor(y2));
+  y2 = fixture->GPUToCPU(y2);
+  test::ExpectTensorEqual<float>(y2, test::AsTensor<float>({2, 4}));
+
+  // Read the variable using the handle returned from previous function to
+  // make sure the handle and read value is on the right device.
+  inst_opts = MakeOptions("GPU:0", {"GPU:0"}, {"GPU:0"});
+  Tensor read_resource;
+  TF_CHECK_OK(fixture->Run("ReadResourceVariable", {}, {{"T", DT_FLOAT}},
+                           inst_opts, {returned_handle}, {&read_resource}));
+  EXPECT_TRUE(IsCUDATensor(read_resource));
+  read_resource = fixture->GPUToCPU(read_resource);
+  test::ExpectTensorEqual<float>(read_resource,
+                                 test::AsTensor<float>({10, 20}));
+
+  fixture->rendezvous_->Unref();
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_ResourceOutput_GPU) {
+  TestResourceOutputAndUse(this, "GPU:0");
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_ResourceOutput_CPU) {
+  TestResourceOutputAndUse(this, "CPU:0");
+}
+
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index bf2d902af41c690be25a170da6fc22a4902e2d50..3d3711ffc58acc472ddff4b3e497135c4d46bdcf 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -37,6 +37,11 @@ Status GetWindowedOutputSizeVerboseV2(int64 input_size, int64 filter_size,
       *output_size = (input_size - effective_filter_size + stride) / stride;
       *padding_before = *padding_after = 0;
       break;
+    case Padding::EXPLICIT:
+      *output_size = (input_size + *padding_before + *padding_after -
+                      effective_filter_size + stride) /
+                     stride;
+      break;
     case Padding::SAME:
       *output_size = (input_size + stride - 1) / stride;
       const int64 padding_needed =
@@ -71,6 +76,11 @@ Status GetWindowedOutputSizeVerbose(int64 input_size, int64 filter_size,
 Status GetWindowedOutputSize(int64 input_size, int64 filter_size, int64 stride,
                              Padding padding_type, int64* output_size,
                              int64* padding_size) {
+  if (padding_type == Padding::EXPLICIT) {
+    return errors::Internal(
+        "GetWindowedOutputSize does not handle EXPLICIT padding; call "
+        "GetWindowedOutputSizeVerbose instead");
+  }
   int64 padding_after_unused;
   return GetWindowedOutputSizeVerbose(input_size, filter_size, stride,
                                       padding_type, output_size, padding_size,
@@ -81,6 +91,11 @@ Status GetWindowedOutputSizeV2(int64 input_size, int64 filter_size,
                                int64 dilation_rate, int64 stride,
                                Padding padding_type, int64* output_size,
                                int64* padding_size) {
+  if (padding_type == Padding::EXPLICIT) {
+    return errors::Internal(
+        "GetWindowedOutputSizeV2 does not handle EXPLICIT padding; call "
+        "GetWindowedOutputSizeVerboseV2 instead");
+  }
   int64 padding_after_unused;
   return GetWindowedOutputSizeVerboseV2(input_size, filter_size, dilation_rate,
                                         stride, padding_type, output_size,
@@ -123,8 +138,8 @@ Status GetWindowedOutputSizeFromDimsV2(
     shape_inference::InferenceContext* c,
     shape_inference::DimensionHandle input_size,
     shape_inference::DimensionOrConstant filter_size, int64 dilation_rate,
-    int64 stride, Padding padding_type,
-    shape_inference::DimensionHandle* output_size) {
+    int64 stride, Padding padding_type, int64 padding_before,
+    int64 padding_after, shape_inference::DimensionHandle* output_size) {
   if (stride <= 0) {
     return errors::InvalidArgument("Stride must be > 0, but got ", stride);
   }
@@ -137,6 +152,11 @@ Status GetWindowedOutputSizeFromDimsV2(
   // See also the parallel implementation in GetWindowedOutputSizeVerbose.
   switch (padding_type) {
     case Padding::VALID:
+      padding_before = padding_after = 0;
+      TF_FALLTHROUGH_INTENDED;
+    case Padding::EXPLICIT:
+      TF_RETURN_IF_ERROR(
+          c->Add(input_size, padding_before + padding_after, &input_size));
       if (dilation_rate > 1) {
         DimensionHandle window_size;
         TF_RETURN_IF_ERROR(
@@ -166,9 +186,18 @@ Status GetWindowedOutputSizeFromDims(
     shape_inference::DimensionHandle input_size,
     shape_inference::DimensionOrConstant filter_size, int64 stride,
     Padding padding_type, shape_inference::DimensionHandle* output_size) {
+  if (padding_type == Padding::EXPLICIT) {
+    return errors::Internal(
+        "GetWindowedOutputSizeFromDims does not handle EXPLICIT padding; call "
+        "GetWindowedOutputSizeFromDimsV2 instead");
+  }
   return GetWindowedOutputSizeFromDimsV2(c, input_size, filter_size,
                                          /*dilation_rate=*/1, stride,
-                                         padding_type, output_size);
+                                         padding_type,
+                                         // Give dummy values of -1 to
+                                         // padding_before and padding_after,
+                                         // since explicit padding is not used.
+                                         -1, -1, output_size);
 }
 
 Status UnchangedShape(shape_inference::InferenceContext* c) {
@@ -371,7 +400,10 @@ Status ShapeFromDimensions(DimensionHandle batch_dim,
   return tensorflow::Status::OK();
 }
 
-Status Conv2DShape(shape_inference::InferenceContext* c) {
+namespace {
+
+Status Conv2DShapeImpl(shape_inference::InferenceContext* c,
+                       bool supports_explicit_padding) {
   string data_format_str, filter_format_str;
   if (!c->GetAttr("data_format", &data_format_str).ok()) {
     data_format_str = "NHWC";
@@ -464,13 +496,30 @@ Status Conv2DShape(shape_inference::InferenceContext* c) {
   Padding padding;
   TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding));
 
+  std::vector<int64> explicit_paddings;
+  if (supports_explicit_padding) {
+    TF_RETURN_IF_ERROR(c->GetAttr("explicit_paddings", &explicit_paddings));
+    TF_RETURN_IF_ERROR(CheckValidPadding(padding, explicit_paddings,
+                                         /*num_dims=*/4, data_format));
+  } else {
+    DCHECK(padding != Padding::EXPLICIT);
+  }
+
   DimensionHandle output_rows, output_cols;
+  int64 pad_rows_before = -1, pad_rows_after = -1;
+  int64 pad_cols_before = -1, pad_cols_after = -1;
+  if (padding == Padding::EXPLICIT) {
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'H',
+                             &pad_rows_before, &pad_rows_after);
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'W',
+                             &pad_cols_before, &pad_cols_after);
+  }
   TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
       c, input_spatial_dims[0], filter_rows_dim, dilation_rows, stride_rows,
-      padding, &output_rows));
+      padding, pad_rows_before, pad_rows_after, &output_rows));
   TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
       c, input_spatial_dims[1], filter_cols_dim, dilation_cols, stride_cols,
-      padding, &output_cols));
+      padding, pad_cols_before, pad_cols_after, &output_cols));
 
   ShapeHandle output_shape;
   TF_RETURN_IF_ERROR(
@@ -480,6 +529,19 @@ Status Conv2DShape(shape_inference::InferenceContext* c) {
   return Status::OK();
 }
 
+}  // namespace
+
+// Shape function for Conv2D-like operations that support explicit padding.
+Status Conv2DShapeWithExplicitPadding(shape_inference::InferenceContext* c) {
+  return Conv2DShapeImpl(c, true);
+}
+
+// Shape function for Conv2D-like operations that do not support explicit
+// padding.
+Status Conv2DShape(shape_inference::InferenceContext* c) {
+  return Conv2DShapeImpl(c, false);
+}
+
 // TODO(mjanusz): Unify all conv/pooling shape functions.
 Status Conv3DShape(shape_inference::InferenceContext* c) {
   ShapeHandle input_shape;
@@ -551,13 +613,13 @@ Status Conv3DShape(shape_inference::InferenceContext* c) {
 
   TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
       c, in_planes_dim, filter_planes_dim, dilation_planes, stride_planes,
-      padding, &output_planes));
+      padding, -1, -1, &output_planes));
   TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
-      c, in_rows_dim, filter_rows_dim, dilation_rows, stride_rows, padding,
-      &output_rows));
+      c, in_rows_dim, filter_rows_dim, dilation_rows, stride_rows, padding, -1,
+      -1, &output_rows));
   TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
-      c, in_cols_dim, filter_cols_dim, dilation_cols, stride_cols, padding,
-      &output_cols));
+      c, in_cols_dim, filter_cols_dim, dilation_cols, stride_cols, padding, -1,
+      -1, &output_cols));
 
   ShapeHandle output_shape;
   if (data_format == "NCDHW") {
diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h
index 362899b947b1fd479d227ac5421a5f458405f3c6..14b9688bdc5d41e8cb2e92b1f1a8640fb9687d8c 100644
--- a/tensorflow/core/framework/common_shape_fns.h
+++ b/tensorflow/core/framework/common_shape_fns.h
@@ -38,11 +38,12 @@ namespace tensorflow {
 //
 // Padding (P): the padding we apply to the input tensor along each
 // dimension. This is usually used to make sure that the spatial dimensions
-// do not shrink when we progress with convolutions. Two types of padding are
-// often used:
+// do not shrink when we progress with convolutions. This function supports two
+// types of padding.
 //   SAME: the pad value is computed so that the output will have size H/S.
 //   VALID: no padding is carried out.
-// The padded area is zero-filled.
+// If you want to use EXPLICIT padding, GetWindowedOutputSizeVerbose must be
+// called instead. Note the padded area is zero-filled.
 //
 // The output dimensions for convolution and many other operations, when given
 // all the parameters above, are as follows:
@@ -95,6 +96,9 @@ Status GetWindowedOutputSize(int64 input_size, int64 filter_size, int64 stride,
 //   When the stride is 1, the expression simplifies to
 //     H' = H-K'+1.
 //
+// If you want to use EXPLICIT padding, GetWindowedOutputSizeVerboseV2 must be
+// called instead
+//
 // TODO(b/67112639): Merge V2 versions and the original versions eventually.
 Status GetWindowedOutputSizeV2(int64 input_size, int64 filter_size,
                                int64 dilation_rate, int64 stride,
@@ -102,9 +106,12 @@ Status GetWindowedOutputSizeV2(int64 input_size, int64 filter_size,
                                int64* padding_size);
 
 // Returns the same output dimensions as in GetWindowedOutputSize, but returns
-// verbose padding dimensions (before/after). Any excess padding
-// (caused by an odd padding size value) is added to the 'padding_after'
-// dimension.
+// verbose padding dimensions (before/after), and EXPLICIT padding is supported.
+// When padding_type is EXPLICIT, *padding_before and *padding_after must
+// already point to initialized integers with the padding amounts. Otherwise,
+// *padding_before and *padding_after are set by this function, and any
+// excess padding (caused by an odd padding size value) is added to the
+// 'padding_after' dimension.
 Status GetWindowedOutputSizeVerbose(int64 input_size, int64 filter_size,
                                     int64 stride, Padding padding_type,
                                     int64* output_size, int64* padding_before,
@@ -122,7 +129,8 @@ Status GetWindowedOutputSizeVerboseV2(int64 input_size, int64 filter_size,
 // of the output tensor and padding to be applied to the input tensor at the
 // lower end of every dimension. Use for 3D convolutions, where the input data
 // is padded with zeros, as well as for 3D avg/max pooling, where the input data
-// is padded with invalid values that are not considered for pooling.
+// is padded with invalid values that are not considered for pooling. EXPLICIT
+// padding is not supported.
 Status Get3dOutputSize(const std::array<int64, 3>& input,
                        const std::array<int64, 3>& window,
                        const std::array<int64, 3>& strides,
@@ -140,21 +148,23 @@ Status Get3dOutputSizeV2(const std::array<int64, 3>& input,
 
 namespace shape_inference {
 
-// Like GetWindowedOutputSize, but deals with DimensionHandles.
+// Like GetWindowedOutputSize, but deals with DimensionHandles. Does not support
+// EXPLICIT padding.
 Status GetWindowedOutputSizeFromDims(InferenceContext* c,
                                      DimensionHandle input_size,
                                      DimensionOrConstant filter_size,
                                      int64 stride, Padding padding_type,
                                      DimensionHandle* output_size);
 
-// The V2 version computes the same outputs with arbitrary dilation_rate. For
-// detailed equations, refer to the comments for GetWindowedOutputSizeV2().
-Status GetWindowedOutputSizeFromDimsV2(InferenceContext* c,
-                                       DimensionHandle input_size,
-                                       DimensionOrConstant filter_size,
-                                       int64 dilation_rate, int64 stride,
-                                       Padding padding_type,
-                                       DimensionHandle* output_size);
+// The V2 version computes the same outputs with arbitrary dilation_rate, and
+// supports EXPLICIT padding. For detailed equations, refer to the comments
+// for GetWindowedOutputSizeV2(). The 'padding_before' and 'padding_after'
+// parameters are only used if padding_type == EXPLICIT.
+Status GetWindowedOutputSizeFromDimsV2(
+    InferenceContext* c, DimensionHandle input_size,
+    DimensionOrConstant filter_size, int64 dilation_rate, int64 stride,
+    Padding padding_type, int64 padding_before, int64 padding_after,
+    DimensionHandle* output_size);
 
 // Transfers shape of input(0) to output(0).
 Status UnchangedShape(shape_inference::InferenceContext* c);
@@ -222,7 +232,11 @@ Status BiasAddShape(shape_inference::InferenceContext* c);
 // Shape function for BiasAddGrad-like operations.
 Status BiasAddGradShape(shape_inference::InferenceContext* c);
 
-// Shape function for Conv2D-like operations.
+// Shape function for Conv2D-like operations that support explicit padding.
+Status Conv2DShapeWithExplicitPadding(shape_inference::InferenceContext* c);
+
+// Shape function for Conv2D-like operations that do not support explicit
+// padding.
 Status Conv2DShape(shape_inference::InferenceContext* c);
 
 // Shape function for Conv3D-like operations.
diff --git a/tensorflow/core/framework/common_shape_fns_test.cc b/tensorflow/core/framework/common_shape_fns_test.cc
index 7c395679d304ffab1dfeff6804eede0d09b63734..b94925c04ee2794033b072a1bc62cf841081a769 100644
--- a/tensorflow/core/framework/common_shape_fns_test.cc
+++ b/tensorflow/core/framework/common_shape_fns_test.cc
@@ -408,12 +408,14 @@ TEST(CommonShapeFnsTest, BiasAddGradShapeTest) {
 TEST(CommonShapeFnsTest, Conv2DShapeTest) {
   ShapeInferenceTestOp op("Conv2D");
   auto set_op = [&op](const std::vector<int32>& strides, const string& padding,
-                      const string& data_format, const string& filter_format) {
+                      const string& data_format, const string& filter_format,
+                      const std::vector<int32>& explicit_paddings = {}) {
     TF_CHECK_OK(NodeDefBuilder("test", "Conv2D")
                     .Input("input", 0, DT_FLOAT)
                     .Input("filter", 0, DT_FLOAT)
                     .Attr("strides", strides)
                     .Attr("padding", padding)
+                    .Attr("explicit_paddings", explicit_paddings)
                     .Attr("data_format", data_format)
                     .Attr("filter_format", filter_format)
                     .Finalize(&op.node_def));
@@ -536,19 +538,73 @@ TEST(CommonShapeFnsTest, Conv2DShapeTest) {
   INFER_OK(op, "[1,?,4,1];[?,?,?,?]", "[d0_0,?,2,d1_3]");
   INFER_OK(op, "[1,4,?,1];[?,?,?,?]", "[d0_0,2,?,d1_3]");
   INFER_OK(op, "[1,4,4,?];[?,?,?,?]", "[d0_0,2,2,d1_3]");
+
+  // Some tests for "EXPLICIT" padding
+
+  // 4x4 input, 1x1 filter, 1x1 stride, [0, 2, 1, 4] padding
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 0, 2, 1, 4, 0, 0});
+  INFER_OK(op, "[1,4,4,1];[1,1,1,1]", "[d0_0,6,9,d1_3]");
+
+  // 3x3 input, 2x2 filter, 1x1 stride, [1, 0, 1, 2] padding
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 1, 0, 1, 2, 0, 0});
+  INFER_OK(op, "[1,3,3,1];[2,2,1,1]", "[d0_0,3,5,d1_3]");
+
+  // 4x4 input, 2x2 filter, 2x2 stride, [3, 2, 1, 0] padding
+  set_op({{1, 2, 2, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 3, 2, 1, 0, 0, 0});
+  INFER_OK(op, "[1,4,4,2];[2,2,2,3]", "[d0_0,4,2,d1_3]");
+
+  // 2x2 input, 2x1 filter, 1x2 stride, [1, 1, 2, 2] padding
+  set_op({{1, 1, 2, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 1, 1, 2, 2, 0, 0});
+  INFER_OK(op, "[1,2,2,1];[2,1,1,1]", "[d0_0,3,3,d1_3]");
+
+  // Unknown dims in the critical fields lead to partial inference.
+  INFER_OK(op, "[1,4,4,1];[2,1,1,1]", "[d0_0,5,4,d1_3]");
+  INFER_OK(op, "[1,?,4,1];[2,1,1,1]", "[d0_0,?,4,d1_3]");
+  INFER_OK(op, "[1,4,?,1];[2,1,1,1]", "[d0_0,5,?,d1_3]");
+  INFER_OK(op, "[1,4,4,?];[2,1,1,1]", "[d0_0,5,4,d1_3]");
+  INFER_OK(op, "[1,4,4,1];[?,1,1,1]", "[d0_0,?,4,d1_3]");
+  INFER_OK(op, "[1,4,4,1];[2,?,1,1]", "[d0_0,5,?,d1_3]");
+
+  // Explicit padding errors
+  // Negative padding
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 0, -1, 0, 0, 0, 0});
+  INFER_ERROR("must be nonnegative", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Too little padding (7 explicit paddings instead of 8)
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 0, 0, 0, 0, 0});
+  INFER_ERROR("must contain 8 values", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Too much padding (9 explicit paddings instead of 8)
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO",
+         {0, 0, 0, 0, 0, 0, 0, 0, 0});
+  INFER_ERROR("must contain 8 values", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Padding in batch dimension
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO", {1, 0, 0, 0, 0, 0, 0, 0});
+  INFER_ERROR("batch or depth dimensions", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Padding in depth dimension
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 0, 0, 0, 0, 1, 0});
+  INFER_ERROR("batch or depth dimensions", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Padding explicit_paddings when padding is not EXPLICIT
+  set_op({{1, 1, 1, 1}}, "VALID", "NHWC", "HWIO", {0, 0, 0, 0, 0, 0, 0, 0});
+  INFER_ERROR("must be empty", op, "[1,2,2,1];[1,1,1,1]");
 }
 
 TEST(CommonShapeFnsTest, Conv2DDilatedShapeTest) {
   ShapeInferenceTestOp op("Conv2D");
   auto set_op = [&op](const std::vector<int32>& dilations,
                       const std::vector<int32>& strides, const string& padding,
-                      const string& data_format) {
+                      const string& data_format,
+                      const std::vector<int32>& explicit_paddings = {}) {
     TF_CHECK_OK(NodeDefBuilder("test", "Conv2D")
                     .Input("input", 0, DT_FLOAT)
                     .Input("filter", 0, DT_FLOAT)
                     .Attr("dilations", dilations)
                     .Attr("strides", strides)
                     .Attr("padding", padding)
+                    .Attr("explicit_paddings", explicit_paddings)
                     .Attr("data_format", data_format)
                     .Finalize(&op.node_def));
   };
@@ -628,6 +684,28 @@ TEST(CommonShapeFnsTest, Conv2DDilatedShapeTest) {
   // 4x4 input, 2x2 filter, 2x2 dilations, 1x1 stride
   set_op({{1, 2, 2, 1}}, {{1, 1, 1, 1}}, "SAME", "NHWC");
   INFER_OK(op, "[1,4,4,1];[2,2,1,1]", "[d0_0,d0_1,d0_2,d1_3]");
+
+  // Some tests for "EXPLICIT" padding
+
+  // 4x4 input, 1x1 filter, 2x1 dilations, 1x1 stride, [0, 2, 1, 4] padding
+  set_op({{1, 2, 1, 1}}, {{1, 1, 1, 1}}, "EXPLICIT", "NHWC",
+         {0, 0, 0, 2, 1, 4, 0, 0});
+  INFER_OK(op, "[1,4,4,1];[1,1,1,1]", "[d0_0,6,9,d1_3]");
+
+  // 3x3 input, 2x2 filter, 2x2 dilations, 1x1 stride, [1, 0, 1, 2] padding
+  set_op({{1, 2, 2, 1}}, {{1, 1, 1, 1}}, "EXPLICIT", "NHWC",
+         {0, 0, 1, 0, 1, 2, 0, 0});
+  INFER_OK(op, "[1,3,3,1];[2,2,1,1]", "[d0_0,2,4,d1_3]");
+
+  // 4x4 input, 2x2 filter, 1x2 dilations, 2x2 stride, [3, 2, 1, 0] padding
+  set_op({{1, 1, 2, 1}}, {{1, 2, 2, 1}}, "EXPLICIT", "NHWC",
+         {0, 0, 3, 2, 1, 0, 0, 0});
+  INFER_OK(op, "[1,4,4,1];[2,2,1,1]", "[d0_0,4,2,d1_3]");
+
+  // 4x4 input, 2x2 filter, 2x2 dilations, 1x1 stride, [1, 1, 2, 2] padding
+  set_op({{1, 2, 2, 1}}, {{1, 1, 1, 1}}, "EXPLICIT", "NHWC",
+         {0, 0, 1, 1, 2, 2, 0, 0});
+  INFER_OK(op, "[1,4,4,1];[2,2,1,1]", "[d0_0,4,6,d1_3]");
 }
 
 TEST(CommonShapeFnsTest, Conv3DShapeTest) {
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index 94af4ee580b1e7dc1e760ed7d62575e3f8ddb817..b7adfd0c947b60ff9295c867f4afdf756208b126 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -682,8 +682,9 @@ Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values,
   const OpDef& sig = fdef.signature();
   TF_RETURN_IF_ERROR(ValidateSignatureWithAttrs(sig, attr_values));
 
-  bool ints_on_device = fdef.attr().count("experimental_ints_on_device") != 0 &&
-                        fdef.attr().at("experimental_ints_on_device").b();
+  bool ints_on_device =
+      fdef.attr().count(FunctionLibraryDefinition::kIntsOnDeviceAttr) != 0 &&
+      fdef.attr().at(FunctionLibraryDefinition::kIntsOnDeviceAttr).b();
 
   FunctionInstantiationHelper helper(get_function, result);
   Status s;
@@ -868,7 +869,8 @@ string FunctionLibraryRuntime::ExecutorType(const InstantiateOptions& options,
 string Canonicalize(const string& funcname, AttrSlice attrs,
                     const FunctionLibraryRuntime::InstantiateOptions& options) {
   std::vector<string> entries;
-  entries.reserve(options.target.empty() ? attrs.size() : (attrs.size() + 1));
+  entries.reserve(attrs.size() + static_cast<int>(options.target.empty()) +
+                  options.input_devices.size());
   for (auto p : attrs) {
     if (p.first != kExecutorAttr) {
       entries.push_back(strings::StrCat(p.first, "=", Print(p.second)));
@@ -878,6 +880,14 @@ string Canonicalize(const string& funcname, AttrSlice attrs,
     entries.push_back(
         strings::StrCat("_target", "=", str_util::CEscape(options.target)));
   }
+  for (int i = 0; i < options.input_devices.size(); ++i) {
+    entries.push_back(strings::StrCat(
+        "_input_dev", i, "=", str_util::CEscape(options.input_devices[i])));
+  }
+  for (int i = 0; i < options.output_devices.size(); ++i) {
+    entries.push_back(strings::StrCat(
+        "_output_dev", i, "=", str_util::CEscape(options.output_devices[i])));
+  }
   if (options.overlay_lib) {
     entries.push_back(strings::StrCat(
         "_overlay_lib", "=", reinterpret_cast<uintptr_t>(options.overlay_lib)));
@@ -1491,6 +1501,9 @@ NodeDef FunctionDefHelper::Node::ToNodeDef() const {
   for (const string& d : this->dep) {
     n.add_input(strings::StrCat("^", d));
   }
+  if (!this->device.empty()) {
+    n.set_device(this->device);
+  }
   return n;
 }
 
@@ -1533,6 +1546,7 @@ FunctionDef FunctionDefHelper::Create(
       fdef.mutable_signature()->set_is_stateful(true);
     }
   }
+
   return fdef;
 }
 
@@ -1640,4 +1654,4 @@ Status GetOpGradientCreator(const string& op, Creator* creator) {
 
 }  // end namespace gradient
 
-}  // end namespace tensorflow
+}  // namespace tensorflow
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 9cf4b0f4cdf1d4c3604eebcf33bb51274578d73c..79755f599cfc80fa3ccdbadc83cef65667d07250 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/selective_registration.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
@@ -35,6 +36,8 @@ namespace tensorflow {
 
 class CancellationManager;
 class CollectiveExecutor;
+class DeviceSet;
+class Graph;
 class GraphDef;
 class OpKernel;
 class ProcessFunctionLibraryRuntime;
@@ -114,6 +117,7 @@ class FunctionDefHelper {
     std::vector<string> arg;
     std::vector<std::pair<string, AttrValueWrapper>> attr;
     std::vector<string> dep;
+    string device;
 
     NodeDef ToNodeDef() const;
   };
@@ -382,6 +386,8 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   static constexpr const char* const kDeviceArgOp = "_DeviceArg";
   static constexpr const char* const kRetOp = "_Retval";
   static constexpr const char* const kDeviceRetOp = "_DeviceRetval";
+  static constexpr const char* const kIntsOnDeviceAttr =
+      "experimental_ints_on_device";
 
   static constexpr const char* const kGradientOp = "SymbolicGradient";
   static constexpr const char* const kFuncAttr = "f";
@@ -489,6 +495,27 @@ class FunctionLibraryRuntime {
     // instantiated on the local device.
     string target;
 
+    // Should the function be instantiated as a multi-device function?
+    bool is_multi_device_function = false;
+
+    // For multi-device functions, a vector of canonical device names for
+    // function's inputs. The device of resource inputs must be the device
+    // backing the resource, not the CPU device backing the resource handle.
+    // Must have the same length as number of inputs to the function.
+    std::vector<string> input_devices;
+
+    // For multi-device functions, a vector of canonical device names for
+    // function's outputs. The device of resource outputs should be the CPU
+    // device, not the device backing the resource.
+    // If specified, must have the same length as the number of function
+    // outputs.
+    // If not specified, output devices are picked automatically. If operations
+    // producing the output tensors have explicit device specification, they
+    // will be respected. These device specifications must identify a unique
+    // device, i.e.  a general specification like "job:foo" matching multiple
+    // devices will result in an error.
+    std::vector<string> output_devices;
+
     // This interface is EXPERIMENTAL and subject to change.
     //
     // If non-null, the runtime will use `overlay_lib` to resolve
@@ -523,6 +550,17 @@ class FunctionLibraryRuntime {
     // instantiation time, rather than on the first run. This can be used to
     // surface errors earlier.
     bool create_kernels_eagerly = false;
+
+    // If provided, this optimization function will be invoked before
+    // the placer for multi-device functions.
+    std::function<Status(std::vector<string> /*ret_node_names*/,
+                         FunctionLibraryDefinition*, const DeviceSet&,
+                         Device* /*cpu_device*/, std::unique_ptr<Graph>*)>
+        optimize_graph_fn;
+
+    // If set, partitioned functions will be added to `graph_collector`.
+    // `graph_collector` must be alive during the call to Instantiate.
+    GraphCollector* graph_collector = nullptr;
   };
   typedef uint64 Handle;
   virtual Status Instantiate(const string& function_name, AttrSlice attrs,
diff --git a/tensorflow/core/framework/function_test.cc b/tensorflow/core/framework/function_test.cc
index 75d45fa2c84ebc340dfb79b76f7b406d7a099c1f..6a828e9afaaec536d4d5ef51d50dec88fdd6d391 100644
--- a/tensorflow/core/framework/function_test.cc
+++ b/tensorflow/core/framework/function_test.cc
@@ -505,7 +505,8 @@ TEST(TFunc, IntsOnDeviceArgNotSet) {
 
 TEST(TFunc, IntsOnDeviceArgSet) {
   auto fdef = test::function::XTimesTwoInt32();
-  (*fdef.mutable_attr())["experimental_ints_on_device"].set_b(true);
+  (*fdef.mutable_attr())[FunctionLibraryDefinition::kIntsOnDeviceAttr].set_b(
+      true);
   InstantiationResult result;
   TF_ASSERT_OK(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result));
   EXPECT_EQ(5, result.nodes.size());
diff --git a/tensorflow/core/framework/function_testlib.cc b/tensorflow/core/framework/function_testlib.cc
index 0445c242e95f490a10e9d54f986dd6b281fb6e0a..0bc07d7f91cf63e93b1188b163d00767fa73a3d8 100644
--- a/tensorflow/core/framework/function_testlib.cc
+++ b/tensorflow/core/framework/function_testlib.cc
@@ -135,6 +135,114 @@ FunctionDef XTimesTwo() {
       });
 }
 
+FunctionDef TwoDeviceMult() {
+  const Tensor kTwo = test::AsScalar<int64>(2);
+  const Tensor kThree = test::AsScalar<int64>(3);
+  return FDH::Create(
+      // Name
+      "TwoDeviceMult",
+      // Args
+      {"x: T"},
+      // Return values
+      {"y_cpu: T", "y_gpu: T"},
+      // Attr def
+      {"T: {float, double, int32, int64}"},
+      // Nodes
+      {
+          {{"num_2"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_INT64}}},
+          {{"num_3"}, "Const", {}, {{"value", kThree}, {"dtype", DT_INT64}}},
+          {{"factor_2"},
+           "Cast",
+           {"num_2:output:0"},
+           {{"SrcT", DT_INT64}, {"DstT", "$T"}}},
+          {{"factor_3"},
+           "Cast",
+           {"num_3:output:0"},
+           {{"SrcT", DT_INT64}, {"DstT", "$T"}}},
+          {{"y_cpu"},
+           "Mul",
+           {"x", "factor_2:y:0"},
+           {{"T", "$T"}},
+           {},
+           "/device:CPU:0"},
+          {{"y_gpu"},
+           "Mul",
+           {"x", "factor_3:y:0"},
+           {{"T", "$T"}},
+           {},
+           "/device:GPU:0"},
+      },
+      {{"y_cpu", "y_cpu:z:0"}, {"y_gpu", "y_gpu:z:0"}});
+}
+
+FunctionDef TwoDeviceInputOutput() {
+  const Tensor kTwo = test::AsScalar<float>(2);
+  const Tensor kThree = test::AsScalar<float>(3);
+  return FDH::Create(
+      // Name
+      "TwoDeviceInputOutput",
+      // Args
+      {"x1: T", "x2: T"},
+      // Return values
+      {"y_cpu: T", "y_gpu: T"},
+      // Attr def
+      {"T: {float}"},
+      // Nodes
+      {
+          {{"num_2"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_FLOAT}}},
+          {{"num_3"}, "Const", {}, {{"value", kThree}, {"dtype", DT_FLOAT}}},
+          {{"y_cpu"},
+           "Mul",
+           {"x1", "num_2:output:0"},
+           {{"T", "$T"}},
+           {},
+           "/device:CPU:0"},
+          {{"y_gpu"},
+           "Mul",
+           {"x2", "num_3:output:0"},
+           {{"T", "$T"}},
+           {},
+           "/device:GPU:0"},
+      },
+      {{"y_cpu", "y_cpu:z:0"}, {"y_gpu", "y_gpu:z:0"}});
+}
+
+FunctionDef FuncWithListInput() {
+  const Tensor kTwo = test::AsScalar<float>(2);
+  return FDH::Create(
+      // Name
+      "FuncWithListInput",
+      // Args
+      {"x1: N * T"},
+      // Return values
+      {},
+      // Attr def
+      {"T: {float}", "N: int >= 1"},
+      // Nodes
+      {
+          {{"num_2"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_FLOAT}}},
+      },
+      {});
+}
+
+FunctionDef FuncWithListOutput() {
+  const Tensor kTwo = test::AsScalar<float>(2);
+  return FDH::Create(
+      // Name
+      "FuncWithListOutput",
+      // Args
+      {},
+      // Return values
+      {"y: N * T"},
+      // Attr def
+      {"T: {float}", "N: int >= 1"},
+      // Nodes
+      {
+          {{"num_2"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_FLOAT}}},
+      },
+      {{"y", "num_2:output:0"}});
+}
+
 FunctionDef XAddX() {
   return FDH::Define(
       // Name
@@ -243,6 +351,58 @@ FunctionDef Swap() {
        {{"o1"}, "Identity", {"i0"}, {{"T", "$T"}}}});
 }
 
+FunctionDef EmptyBodySwap() {
+  return FDH::Create(
+      // Name
+      "EmptyBodySwap",
+      // Args
+      {"i0: T", "i1: T"},
+      // Return values
+      {"o0: T", "o1: T"},
+      // Attr def
+      {"T: {float, double}"},
+      // Nodes
+      {},
+      // Output mapping
+      {{"o0", "i1"}, {"o1", "i0"}});
+}
+
+FunctionDef ResourceOutput() {
+  const Tensor kTwo = test::AsScalar<float>(2);
+  return FDH::Create(
+      // Name
+      "ResourceOutput",
+      // Args
+      {"x: float", "y: resource"},
+      // Return values
+      {"y_out: resource", "two_x: float"},
+      // Attr def
+      {},
+      // Nodes
+      {
+          {{"two"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_FLOAT}}},
+          {{"mul"}, "Mul", {"x", "two:output:0"}, {{"T", DT_FLOAT}}, {}},
+      },
+      {{"y_out", "y"}, {"two_x", "mul:z:0"}});
+}
+
+FunctionDef ReadResourceVariable() {
+  return FDH::Create(
+      // Name
+      "ReadResourceVariable",
+      // Args
+      {"x: resource"},
+      // Return values
+      {"y: float"},
+      // Attr def
+      {},
+      // Nodes
+      {
+          {{"read"}, "ReadVariableOp", {"x"}, {{"dtype", DT_FLOAT}}, {}},
+      },
+      {{"y", "read:value:0"}});
+}
+
 FunctionDef InvalidControlFlow() {
   return FDH::Create(
       // Name
diff --git a/tensorflow/core/framework/function_testlib.h b/tensorflow/core/framework/function_testlib.h
index a01743423bbfd5c684e82768ee347f1d0734fc04..28532b29d4509105c4b6b7c203e9e81c5780a58f 100644
--- a/tensorflow/core/framework/function_testlib.h
+++ b/tensorflow/core/framework/function_testlib.h
@@ -63,6 +63,21 @@ GraphDef GDef(gtl::ArraySlice<NodeDef> nodes,
 // x:T -> x * 2.
 FunctionDef XTimesTwo();
 
+// x:T -> cpu(x * 2) + cpu(x * 3).
+FunctionDef TwoDeviceTimesFive();
+
+// x:T -> cpu(x * 2), gpu(x * 3).
+FunctionDef TwoDeviceMult();
+
+// cpu(x):T, gpu(y):T -> cpu(x * 2), gpu(y * 3).
+FunctionDef TwoDeviceInputOutput();
+
+// Function taking a list of Tensors as input.
+FunctionDef FuncWithListInput();
+
+// Function returning a list of Tensors as output.
+FunctionDef FuncWithListOutput();
+
 // x:T -> x + x.
 FunctionDef XAddX();
 
@@ -90,6 +105,15 @@ FunctionDef RandomUniform();
 // x:T, y:T -> y:T, x:T
 FunctionDef Swap();
 
+// x:T, y:T -> y:T, x:T, the body has no nodes.
+FunctionDef EmptyBodySwap();
+
+// x:float, y:resource -> y:resource, 2*x:float.
+FunctionDef ResourceOutput();
+
+// x:resource -> y:float.
+FunctionDef ReadResourceVariable();
+
 // Contains malformed control flow which can't be run by the executor.
 FunctionDef InvalidControlFlow();
 
diff --git a/tensorflow/core/framework/lookup_interface.h b/tensorflow/core/framework/lookup_interface.h
index d33945fd1b0c44264855ed518714eb35faf4b29f..7e5dbe5632becb40fd75763eb4be9dfdc09ec82b 100644
--- a/tensorflow/core/framework/lookup_interface.h
+++ b/tensorflow/core/framework/lookup_interface.h
@@ -131,7 +131,7 @@ class LookupInterface : public ResourceBase {
   // - the default_value tensor shape matches the table's value shape.
   Status CheckFindArguments(const Tensor& keys, const Tensor& default_value);
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("A lookup table of size: ", size());
   }
 
diff --git a/tensorflow/core/framework/memory_types.cc b/tensorflow/core/framework/memory_types.cc
index 6dff6fe654a51d3c274f7e2c7ca34961eb4f3c2a..8caea351be4442d348f4405bf4385a1349fc197b 100644
--- a/tensorflow/core/framework/memory_types.cc
+++ b/tensorflow/core/framework/memory_types.cc
@@ -62,7 +62,7 @@ void MemoryTypesHelper(const NameRangeMap& name_map,
 
 bool IsFunctionCallOp(const string& op_type) {
   return op_type == "SymbolicGradient" || op_type == "PartitionedCall" ||
-         op_type == "StatefulPartitionedCall";
+         op_type == "StatefulPartitionedCall" || op_type == "While";
 }
 
 }  // namespace
diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h
index bb1d7b6bff84b2c9328c92288a336294fdd17c5e..d9f71853629915bb1a9dbbc79b60d9ea9a3b7cc8 100644
--- a/tensorflow/core/framework/model.h
+++ b/tensorflow/core/framework/model.h
@@ -310,7 +310,7 @@ class Node {
   std::map<string, std::shared_ptr<Parameter>> parameters_ GUARDED_BY(mu_);
   std::list<std::shared_ptr<Node>> inputs_ GUARDED_BY(mu_);
 
-  // The reference to the output node is not owned so that that deletion of a
+  // The reference to the output node is not owned so that deletion of a
   // node results in recursive deletion of the subtree rooted in the node.
   Node* const output_;
 };
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 44a70500bd0e067d13513d984d5bbb3239940168..789f0fda7526fadc667e51046a344062a9532670 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -118,6 +118,10 @@ OpKernel::OpKernel(OpKernelConstruction* context,
 
 OpKernel::~OpKernel() {}
 
+const uint64 OpKernel::kInitialCostEstimateCycles;
+const uint64 OpKernel::kOpIsExpensiveThresholdCycles;
+const uint64 OpKernel::kCostDecay;
+
 const string& OpKernel::name() const { return def_->name(); }
 const string& OpKernel::type_string() const { return def_->op(); }
 const string& OpKernel::requested_device() const { return def_->device(); }
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 0c911d71f15e8210346cc15587b390a3553f37b2..aa07cbd380a3afc58a2e920d504ebca87267b44d 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -122,14 +122,15 @@ class OpKernel {
   // determine whether an operation should be place in a threadpool.  Operations
   // start out "expensive".
   static const uint64 kInitialCostEstimateCycles = 100 * 1000 * 1000;
-  static const uint64 kOpIsExpensiveThresholdCycles = 25000;
+  static const uint64 kOpIsExpensiveThresholdCycles = 5000;
   static const uint64 kCostDecay = 10;
 
   // Returns true iff this op kernel is considered "expensive". The
   // runtime may use this flag to optimize graph execution for example
   // to "inline" inexpensive kernels.
   virtual bool IsExpensive() {
-    return expensive_ && (cost_estimate_ > kOpIsExpensiveThresholdCycles);
+    return expensive_ && (cost_estimate_.load(std::memory_order_relaxed) >
+                          kOpIsExpensiveThresholdCycles);
   }
 
   // Updates the dynamic cost estimate, which is used to determine whether this
@@ -605,6 +606,9 @@ class OpKernelContext {
     // The session state for this op.
     SessionState* session_state = nullptr;
 
+    // Unique session identifier. Can be empty.
+    string session_handle;
+
     // The tensor store for this op.
     TensorStore* tensor_store = nullptr;
 
@@ -1033,6 +1037,9 @@ class OpKernelContext {
   // An op kernel can access the session state it belongs to.
   SessionState* session_state() const { return params_->session_state; }
 
+  // Unique identifier of the session it belongs to. Can be empty.
+  string session_handle() const { return params_->session_handle; }
+
   // An op kernel can access the tensor store of the run it belongs to.
   TensorStore* tensor_store() const { return params_->tensor_store; }
 
diff --git a/tensorflow/core/framework/op_segment.cc b/tensorflow/core/framework/op_segment.cc
index 75ed4a4eaf231839999efa285c88e2bceda61a07..f7e194baeede8deb529aa7d1f4a0ba3ccc44e792 100644
--- a/tensorflow/core/framework/op_segment.cc
+++ b/tensorflow/core/framework/op_segment.cc
@@ -104,7 +104,8 @@ bool OpSegment::ShouldOwnKernel(FunctionLibraryRuntime* lib,
                                 const string& node_op) {
   // OpSegment should not own kernel if the node is stateless, or a function.
   return lib->IsStateful(node_op) &&
-         lib->GetFunctionLibraryDefinition()->Find(node_op) == nullptr;
+         lib->GetFunctionLibraryDefinition()->Find(node_op) == nullptr &&
+         node_op != "PartitionedCall" && node_op != "StatefulPartitionedCall";
 }
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/framework/ops_util.cc b/tensorflow/core/framework/ops_util.cc
index e8cf014ca03457e4673a14765cee5a05746b901a..4e603b9598fc43f894415b9b8aef6f641e484b6a 100644
--- a/tensorflow/core/framework/ops_util.cc
+++ b/tensorflow/core/framework/ops_util.cc
@@ -30,6 +30,9 @@ Eigen::PaddingType BrainPadding2EigenPadding(Padding padding) {
       return Eigen::PADDING_VALID;
     case Padding::SAME:
       return Eigen::PADDING_SAME;
+    case Padding::EXPLICIT:
+      LOG(FATAL) << "Eigen does not have explicit padding enum "  // Crash OK
+                    "value";
   }
   return Eigen::PADDING_SAME;  // Prevent compiler warning about missing return
 }
diff --git a/tensorflow/core/framework/queue_interface.h b/tensorflow/core/framework/queue_interface.h
index 4ca4416c5ac1471247758cd943d52a7c65f7afaf..9395cce1644f7e8fd09cf40a48b2d7a5abb30bb2 100644
--- a/tensorflow/core/framework/queue_interface.h
+++ b/tensorflow/core/framework/queue_interface.h
@@ -85,11 +85,11 @@ class QueueInterface : public ResourceBase {
   virtual Status MatchesNodeDef(const NodeDef& node_def) = 0;
 
   // Returns the number of elements in the queue.
-  virtual int32 size() = 0;
+  virtual int32 size() const = 0;
 
   virtual const DataTypeVector& component_dtypes() const = 0;
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("A Queue of size: ", size());
   }
 
diff --git a/tensorflow/core/framework/reader_interface.h b/tensorflow/core/framework/reader_interface.h
index f894acbe1d5119081f088bb091049342b881f340..e47644cb8f27af63e1a96d9c3d44d84e8a55224d 100644
--- a/tensorflow/core/framework/reader_interface.h
+++ b/tensorflow/core/framework/reader_interface.h
@@ -76,7 +76,7 @@ class ReaderInterface : public ResourceBase {
   // Note: Must Reset on error.
   virtual Status RestoreState(const string& state) = 0;
 
-  string DebugString() override { return "a reader"; }
+  string DebugString() const override { return "a reader"; }
 
  protected:
   virtual ~ReaderInterface() {}
diff --git a/tensorflow/core/framework/resource_handle.h b/tensorflow/core/framework/resource_handle.h
index db213669a3f30b3b5587a4d587e2bfb039dacdda..d1f6771bf31e492ac47eb260c7d701d7a6c97b36 100644
--- a/tensorflow/core/framework/resource_handle.h
+++ b/tensorflow/core/framework/resource_handle.h
@@ -67,6 +67,11 @@ class ResourceHandle {
 
   string DebugString() const;
 
+  // GUID for anonymous resources. Resources with this shared_name will have
+  // their shared_name replaced with a GUID at creation time
+  static constexpr const char* ANONYMOUS_NAME =
+      "cd2c89b7-88b7-44c8-ad83-06c2a9158347";
+
  public:
   string device_;
   string container_;
diff --git a/tensorflow/core/framework/resource_mgr.cc b/tensorflow/core/framework/resource_mgr.cc
index 9f3204ab96050a1cc06ab3052741f0044369b83e..6a94ff6642e6f50655083756ae24a2c2b97bc7ec 100644
--- a/tensorflow/core/framework/resource_mgr.cc
+++ b/tensorflow/core/framework/resource_mgr.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <atomic>
+
 #include "tensorflow/core/framework/resource_mgr.h"
 
 #include "tensorflow/core/framework/device_attributes.pb.h"
@@ -26,6 +28,10 @@ limitations under the License.
 #include "tensorflow/core/platform/demangle.h"
 
 namespace tensorflow {
+
+// Used to generate unique names for anonymous variables
+static std::atomic<int64> current_id_;
+
 ResourceHandle MakeResourceHandle(OpKernelContext* ctx, const string& container,
                                   const string& name,
                                   const TypeIndex& type_index) {
@@ -38,7 +44,11 @@ ResourceHandle MakeResourceHandle(OpKernelContext* ctx, const string& container,
     actual_container = ctx->resource_manager()->default_container();
   }
   result.set_container(actual_container);
-  result.set_name(name);
+  if (name == ResourceHandle::ANONYMOUS_NAME) {
+    result.set_name(strings::StrCat("_AnonymousVar", current_id_.fetch_add(1)));
+  } else {
+    result.set_name(name);
+  }
   result.set_hash_code(type_index.hash_code());
   result.set_maybe_type_name(type_index.name());
   return result;
diff --git a/tensorflow/core/framework/resource_mgr.h b/tensorflow/core/framework/resource_mgr.h
index 3195cd2e9dccaaf26ac6111a78acdb7278ea92e7..9c381e7d6b4e909689591d3a75bfabbecd886a0d 100644
--- a/tensorflow/core/framework/resource_mgr.h
+++ b/tensorflow/core/framework/resource_mgr.h
@@ -77,7 +77,7 @@ namespace tensorflow {
 class ResourceBase : public core::RefCounted {
  public:
   // Returns a debug string for *this.
-  virtual string DebugString() = 0;
+  virtual string DebugString() const = 0;
 
   // Returns memory used by this resource.
   virtual int64 MemoryUsed() const { return 0; }
@@ -619,20 +619,31 @@ ResourceHandleOp<T>::ResourceHandleOp(OpKernelConstruction* context)
 
 template <typename T>
 void ResourceHandleOp<T>::Compute(OpKernelContext* ctx) {
-  if (!initialized_.load()) {
-    mutex_lock ml(mutex_);
-    // Checking again to see if another thread has initialized the resource.
+  if (name_ == ResourceHandle::ANONYMOUS_NAME) {
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    Tensor handle;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_temp(DT_RESOURCE, TensorShape({}), &handle, attr));
+    handle.scalar<ResourceHandle>()() =
+        MakeResourceHandle<T>(ctx, container_, name_);
+    ctx->set_output(0, handle);
+  } else {
     if (!initialized_.load()) {
-      AllocatorAttributes attr;
-      attr.set_on_host(true);
-      OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_RESOURCE, TensorShape({}),
-                                             &resource_, attr));
-      resource_.scalar<ResourceHandle>()() =
-          MakeResourceHandle<T>(ctx, container_, name_);
-      initialized_.store(true);
+      mutex_lock ml(mutex_);
+      // Checking again to see if another thread has initialized the resource.
+      if (!initialized_.load()) {
+        AllocatorAttributes attr;
+        attr.set_on_host(true);
+        OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_RESOURCE, TensorShape({}),
+                                               &resource_, attr));
+        resource_.scalar<ResourceHandle>()() =
+            MakeResourceHandle<T>(ctx, container_, name_);
+        initialized_.store(true);
+      }
     }
+    ctx->set_output(0, resource_);
   }
-  ctx->set_output(0, resource_);
 }
 
 template <typename T>
diff --git a/tensorflow/core/framework/resource_mgr_test.cc b/tensorflow/core/framework/resource_mgr_test.cc
index 7c7f0af0ce46abbde5b66facf4d33db47f9773b8..1c785736e60b2f03899924f34a207066582a590e 100644
--- a/tensorflow/core/framework/resource_mgr_test.cc
+++ b/tensorflow/core/framework/resource_mgr_test.cc
@@ -32,7 +32,7 @@ class Resource : public ResourceBase {
   explicit Resource(const string& label) : label_(label) {}
   ~Resource() override {}
 
-  string DebugString() override { return strings::StrCat("R/", label_); }
+  string DebugString() const override { return strings::StrCat("R/", label_); }
 
  private:
   string label_;
@@ -43,7 +43,7 @@ class Other : public ResourceBase {
   explicit Other(const string& label) : label_(label) {}
   ~Other() override {}
 
-  string DebugString() override { return strings::StrCat("O/", label_); }
+  string DebugString() const override { return strings::StrCat("O/", label_); }
 
  private:
   string label_;
@@ -245,7 +245,7 @@ class StubDevice : public DeviceBase {
 // Empty stub resource for testing resource handles.
 class StubResource : public ResourceBase {
  public:
-  string DebugString() override { return ""; }
+  string DebugString() const override { return ""; }
   int value_{0};
 };
 
@@ -305,7 +305,7 @@ TEST(ResourceHandleTest, DifferentDevice) {
 // Other stub resource to test type-checking of resource handles.
 class OtherStubResource : public ResourceBase {
  public:
-  string DebugString() override { return ""; }
+  string DebugString() const override { return ""; }
 };
 
 TEST(ResourceHandleTest, DifferentType) {
diff --git a/tensorflow/core/framework/resource_op_kernel_test.cc b/tensorflow/core/framework/resource_op_kernel_test.cc
index c1e503dc57643d2023d89f317a6c5ff643a3c60b..7a2a87045bf20970a6a996cb9d32b264af0662c7 100644
--- a/tensorflow/core/framework/resource_op_kernel_test.cc
+++ b/tensorflow/core/framework/resource_op_kernel_test.cc
@@ -46,7 +46,7 @@ class StubDevice : public DeviceBase {
 // Stub resource for testing resource op kernel.
 class StubResource : public ResourceBase {
  public:
-  string DebugString() override { return ""; }
+  string DebugString() const override { return ""; }
   int code;
 };
 
diff --git a/tensorflow/core/framework/resource_var.h b/tensorflow/core/framework/resource_var.h
index f5de5dba8854adcfd5b94447da3ba42566a26bd8..9387b6c23c77dadfd423865b23bc7dc5fdf41672 100644
--- a/tensorflow/core/framework/resource_var.h
+++ b/tensorflow/core/framework/resource_var.h
@@ -67,7 +67,7 @@ class Var : public ResourceBase {
   mutex* mu() { return &mu_; }
   Tensor* tensor() { return &tensor_; }
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat(DataTypeString(tensor_.dtype()), "/",
                            tensor_.shape().DebugString());
   }
diff --git a/tensorflow/core/framework/stats_aggregator.h b/tensorflow/core/framework/stats_aggregator.h
index af53ed0a3ca64aefe310db3b2d07ce6a18afa181..7c960840d7446889bee1ba22cdbb4af072acd53e 100644
--- a/tensorflow/core/framework/stats_aggregator.h
+++ b/tensorflow/core/framework/stats_aggregator.h
@@ -83,7 +83,7 @@ class StatsAggregatorResource : public ResourceBase {
     return stats_aggregator_;
   }
 
-  string DebugString() { return "StatsAggregatorResource"; }
+  string DebugString() const override { return "StatsAggregatorResource"; }
 
  private:
   const std::shared_ptr<StatsAggregator> stats_aggregator_;
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index 7e841489eb35d4ec3d18fe255472107ef9d60efe..0c96ec8168752eaef2388530176d4c0611f17f09 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -932,10 +932,18 @@ namespace {
 // logic is so simple we can just replicate it here, where it is close to its
 // usage and easy to change later. And there's the extra benefit of not
 // accessing an 'internal' namespace.
-inline const strings::AlphaNum& PrintOneElement(const strings::AlphaNum& a) {
+inline const strings::AlphaNum& PrintOneElement(const strings::AlphaNum& a,
+                                                bool print_v2) {
   return a;
 }
-inline float PrintOneElement(const Eigen::half& h) {
+inline string PrintOneElement(const string& a, bool print_v2) {
+  if (print_v2) {
+    return "\"" + str_util::CEscape(a) + "\"";
+  } else {
+    return str_util::CEscape(a);
+  }
+}
+inline float PrintOneElement(const Eigen::half& h, bool print_v2) {
   return static_cast<float>(h);
 }
 
@@ -957,7 +965,7 @@ void PrintOneDim(int dim_index, const gtl::InlinedVector<int64, 4>& shape,
         return;
       }
       if (i > 0) strings::StrAppend(result, " ");
-      strings::StrAppend(result, PrintOneElement(data[(*data_index)++]));
+      strings::StrAppend(result, PrintOneElement(data[(*data_index)++], false));
     }
     return;
   }
@@ -1000,7 +1008,7 @@ void PrintOneDimV2(int dim_index, const gtl::InlinedVector<int64, 4>& shape,
   // We have recursed beyond all the dimensions into a single element
   // of the tensor.
   if (dim_index == num_dims) {
-    strings::StrAppend(result, PrintOneElement(data[data_index]));
+    strings::StrAppend(result, PrintOneElement(data[data_index], true));
     return;
   }
 
@@ -1048,7 +1056,7 @@ string SummarizeArray(int64 limit, int64 num_elts,
   if (shape.empty()) {
     for (int64 i = 0; i < limit; ++i) {
       if (i > 0) strings::StrAppend(&ret, " ");
-      strings::StrAppend(&ret, PrintOneElement(array[i]));
+      strings::StrAppend(&ret, PrintOneElement(array[i], print_v2));
     }
     if (num_elts > limit) strings::StrAppend(&ret, "...");
     return ret;
@@ -1123,6 +1131,9 @@ string Tensor::SummarizeValue(int64 max_entries, bool print_v2) const {
       // will emit "1 0..." which is more compact.
       return SummarizeArray<bool>(limit, num_elts, shape_, data, print_v2);
       break;
+    case DT_STRING:
+      return SummarizeArray<string>(limit, num_elts, shape_, data, print_v2);
+      break;
     default: {
       // All irregular cases
       string ret;
@@ -1134,9 +1145,6 @@ string Tensor::SummarizeValue(int64 max_entries, bool print_v2) const {
       for (size_t i = 0; i < limit; ++i) {
         if (i > 0) strings::StrAppend(&ret, " ");
         switch (dtype()) {
-          case DT_STRING:
-            strings::StrAppend(&ret, str_util::CEscape(flat<string>()(i)));
-            break;
           case DT_VARIANT: {
             const Variant& v = flat<Variant>()(i);
             strings::StrAppend(&ret, v.DebugString());
diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc
index 713f91fe04c6fe498209d88193f6fbb1729ec57c..d4aed387610579dc02a7566fdda44d042d203c35 100644
--- a/tensorflow/core/framework/tensor_test.cc
+++ b/tensorflow/core/framework/tensor_test.cc
@@ -1370,7 +1370,7 @@ TEST(SummarizeValue, STRING) {
   EXPECT_EQ("one two three four five", x.SummarizeValue(16));
   x = MkTensor<string>(DT_STRING, TensorShape({5, 1, 5}),
                        {"one", "two", "three", "four", "five"});
-  EXPECT_EQ("one two three four five one...", x.SummarizeValue(6));
+  EXPECT_EQ("[[one two three four five]][[one...]]...", x.SummarizeValue(6));
 }
 
 TEST(SummarizeValue, INT32_PRINT_V2) {
@@ -1423,11 +1423,16 @@ TEST(SummarizeValue, BOOL_PRINT_V2) {
 TEST(SummarizeValue, STRING_PRINT_V2) {
   Tensor x = MkTensor<string>(DT_STRING, TensorShape({5}),
                               {"one", "two", "three", "four", "five"});
-  EXPECT_EQ("[one two three four five]", x.SummarizeValue(16, true));
-  EXPECT_EQ("[one two three four five]", x.SummarizeValue(-1, true));
-  x = MkTensor<string>(DT_STRING, TensorShape({5, 1, 5}),
+  EXPECT_EQ("[\"one\" \"two\" \"three\" \"four\" \"five\"]",
+            x.SummarizeValue(16, true));
+  EXPECT_EQ("[\"one\" \"two\" \"three\" \"four\" \"five\"]",
+            x.SummarizeValue(-1, true));
+  EXPECT_EQ("[\"one\" \"two\" ... \"four\" \"five\"]",
+            x.SummarizeValue(2, true));
+  x = MkTensor<string>(DT_STRING, TensorShape({2, 2}),
                        {"one", "two", "three", "four", "five"});
-  EXPECT_EQ("[one two three four five one...]", x.SummarizeValue(6, true));
+  EXPECT_EQ("[[\"one\" \"two\"]\n [\"three\" \"four\"]]",
+            x.SummarizeValue(16, true));
 }
 
 void BM_CreateAndDestroy(int iters) {
diff --git a/tensorflow/core/framework/tensor_testutil.h b/tensorflow/core/framework/tensor_testutil.h
index 31630028516a4f7896986220f4ff0bd8f09fd37a..b58292b3b0225e6f2df7710347019a1c6d7bc512 100644
--- a/tensorflow/core/framework/tensor_testutil.h
+++ b/tensorflow/core/framework/tensor_testutil.h
@@ -206,7 +206,7 @@ struct Expector<T, true> {
     const T* b = y.flat<T>().data();
     for (int i = 0; i < size; ++i) {
       EXPECT_TRUE(Near(a[i], b[i], abs_err))
-          << "a = " << a[i] << " b = " << b << " index = " << i;
+          << "a = " << a[i] << " b = " << b[i] << " index = " << i;
     }
   }
 };
diff --git a/tensorflow/core/graph/algorithm.cc b/tensorflow/core/graph/algorithm.cc
index 9b4200e0b47ec37ddbef1e375e1955c6ec814caf..25bd3516a16490bd2b3b535f880757b001a5f017 100644
--- a/tensorflow/core/graph/algorithm.cc
+++ b/tensorflow/core/graph/algorithm.cc
@@ -222,11 +222,12 @@ bool FixupSourceAndSinkEdges(Graph* g) {
   bool changed = false;
   for (Node* n : g->nodes()) {
     if (!n->IsSource() && n->in_edges().empty()) {
-      g->AddControlEdge(g->source_node(), n);
+      g->AddControlEdge(g->source_node(), n,
+                        true /* skip test for duplicates */);
       changed = true;
     }
     if (!n->IsSink() && n->out_edges().empty()) {
-      g->AddControlEdge(n, g->sink_node());
+      g->AddControlEdge(n, g->sink_node(), true /* skip test for duplicates */);
       changed = true;
     }
   }
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index f213eb7c107c92be55d4efcf7b8551f1ac282154..00c7a5b091c0dbfbcf08a3611faaab4d41a08152 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/framework/node_def_builder.h"
@@ -58,22 +59,15 @@ struct DupRecvKey {
   int src_output_slot;       // Edge's src node output slot
   GraphDef* dst_graph;       // Edge's dst node is in this subgraph
   bool recv_output_on_host;  // The output of recv is on host
-};
 
-struct DupRecvKeyHash {
-  size_t operator()(const DupRecvKey& k) const {
-    size_t h = Hash64(reinterpret_cast<const char*>(&k.src_node_id),
-                      sizeof(k.src_node_id), k.src_output_slot);
-    h = Hash64(reinterpret_cast<const char*>(&k.dst_graph), sizeof(k.dst_graph),
-               h);
-    h = Hash64(reinterpret_cast<const char*>(&k.recv_output_on_host),
-               sizeof(k.recv_output_on_host), h);
-    return h;
+  template <typename H>
+  friend H AbslHashValue(H h, const DupRecvKey& c) {
+    return H::combine(std::move(h), c.src_node_id, c.src_output_slot,
+                      reinterpret_cast<std::uintptr_t>(c.dst_graph),
+                      c.recv_output_on_host);
   }
-};
 
-struct DupRecvKeyEq {
-  bool operator()(const DupRecvKey& x, const DupRecvKey& y) const {
+  friend bool operator==(const DupRecvKey& x, const DupRecvKey& y) {
     return (x.src_node_id == y.src_node_id) &&
            (x.src_output_slot == y.src_output_slot) &&
            (x.dst_graph == y.dst_graph) &&
@@ -88,19 +82,26 @@ struct RecvInfo {
   int64 start_time;
 };
 
-typedef std::unordered_map<DupRecvKey, RecvInfo, DupRecvKeyHash, DupRecvKeyEq>
-    DupRecvTable;
+typedef absl::flat_hash_map<DupRecvKey, RecvInfo> DupRecvTable;
 
-struct PairIntHash {
- public:
-  std::size_t operator()(const std::pair<int, int>& x) const {
-    return std::hash<int>()(x.first) ^ std::hash<int>()(x.second);
-  }
-};
 // A map used to store memory types for the inputs/outputs of every node.
 // The key is a pair of ints consisting of a node id and input/output index.
-typedef std::unordered_map<std::pair<int, int>, MemoryType, PairIntHash>
-    MemoryTypeMap;
+// TODO(power): migrate back to std::pair when absl::Hash is fixed for MSVC.
+struct NodePort {
+  int node_id;
+  int index;
+
+  friend bool operator==(const NodePort& x, const NodePort& y) {
+    return x.node_id == y.node_id && x.index == y.index;
+  }
+
+  template <typename H>
+  friend H AbslHashValue(H h, const NodePort& c) {
+    return H::combine(std::move(h), c.node_id, c.index);
+  }
+};
+
+typedef absl::flat_hash_map<NodePort, MemoryType> MemoryTypeMap;
 
 // We collect the following information about the graph before performing
 // graph partitioning.
@@ -564,10 +565,10 @@ Status BuildMemoryDeviceInfo(const Graph& g, GraphInfo* info) {
 
     int node_id = node->id();
     info->device_types[node_id] = DeviceType(parsed.type);
-    for (size_t i = 0; i < input_memory_types.size(); ++i) {
+    for (int i = 0; i < input_memory_types.size(); ++i) {
       info->input_types[{node_id, i}] = input_memory_types[i];
     }
-    for (size_t i = 0; i < output_memory_types.size(); ++i) {
+    for (int i = 0; i < output_memory_types.size(); ++i) {
       info->output_types[{node_id, i}] = output_memory_types[i];
     }
   }
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 9495132f4a6b4fa6383566e4fc7e8b7ab2dbbd21..0f7a81110c5da336c87dd51757780cfbf38201cf 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -843,6 +843,12 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     CHECK_NOTNULL(m);
     Node* n = nullptr;
 
+    DataType T_m;
+    TF_CHECK_OK(GetNodeAttr(m->def(), "T", &T_m));
+
+    // Don't try to merge if datatype is not DT_FLOAT
+    if (T_m != DT_FLOAT) return n;
+
     if (m->type_string() == csinfo_.bias_add) {
       // If a is BiasAdd, then Conv2D is 0th input of BiasAdd.
       TF_CHECK_OK(m->input_node(0, &n));
@@ -877,6 +883,12 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     DCHECK(m);
     Node* n = nullptr;
 
+    DataType T_m;
+    TF_CHECK_OK(GetNodeAttr(m->def(), "T", &T_m));
+
+    // Don't try to merge if datatype is not DT_FLOAT
+    if (T_m != DT_FLOAT) return n;
+
     const Node* conv_node;
     if (m->type_string() == csinfo_.pad) {
       // If m is Pad, then Conv2D is the output of Pad.
@@ -935,6 +947,12 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     CHECK_NOTNULL(m);
     Node* n = nullptr;
 
+    DataType T_m;
+    TF_CHECK_OK(GetNodeAttr(m->def(), "T", &T_m));
+
+    // Don't try to merge if datatype is not DT_FLOAT
+    if (T_m != DT_FLOAT) return n;
+
     if (m->type_string() == csinfo_.bias_add_grad) {
       // Get 1st input 'g' of BiasAddGrad.
       Node* g = nullptr;
@@ -2441,7 +2459,7 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
   std::vector<int32> strides;
   std::vector<int32> dilations;
   string data_format_pred, data_format_succ;
-  bool use_cudnn_on_gnu;
+  bool use_cudnn_on_gpu;
   TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
   TF_CHECK_OK(GetNodeAttr(pred->def(), "padding", &padding));
@@ -2449,7 +2467,7 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
   TF_CHECK_OK(GetNodeAttr(pred->def(), "dilations", &dilations));
   TF_CHECK_OK(GetNodeAttr(pred->def(), "data_format", &data_format_pred));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
-  TF_CHECK_OK(GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
   // We check to ensure that data formats of both succ and pred are same.
   // We expect them to be same, so we can enforce this as assert.
   // But assert can be too strict, so we enforce this as a check.
@@ -2589,7 +2607,7 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr<Graph>* g,
   std::vector<int32> strides;
   std::vector<int32> dilations;
   string data_format_pred, data_format_succ;
-  bool use_cudnn_on_gnu;
+  bool use_cudnn_on_gpu;
   TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "padding", &padding));
@@ -2598,7 +2616,7 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr<Graph>* g,
   // Data format for pad is not available and not necessary, thus
   // dont need to match data format for Pad
   TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
-  TF_CHECK_OK(GetNodeAttr(succ->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
   // Check if the data types and devices of both succ and pred are the same.
   // Assert is not used,  because it can be too strict.
   // Don't need to check for data formats because it is not available in Pad.
diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD
index 6de12192ba87fe2020a5ae2474dc1fd59b2ac366..55d642612bd63f956c4764bb81fa058998a97bd1 100644
--- a/tensorflow/core/grappler/BUILD
+++ b/tensorflow/core/grappler/BUILD
@@ -27,6 +27,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
     ],
 )
@@ -62,6 +63,36 @@ tf_cuda_library(
     ],
 )
 
+cc_library(
+    name = "graph_topology_view",
+    srcs = ["graph_topology_view.cc"],
+    hdrs = ["graph_topology_view.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_view",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+tf_cc_test(
+    name = "graph_topology_view_test",
+    srcs = ["graph_topology_view_test.cc"],
+    deps = [
+        ":graph_topology_view",
+        ":graph_view",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "graph_view",
     srcs = ["graph_view.cc"],
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.cc b/tensorflow/core/grappler/clusters/virtual_cluster.cc
index dbd8f26c286f07107a63e9c745c442b171f29aaa..118f74e8b01171e3780317b4ea36750c66a22b98 100644
--- a/tensorflow/core/grappler/clusters/virtual_cluster.cc
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.cc
@@ -67,13 +67,17 @@ Status VirtualCluster::Run(const GraphDef& graph,
                            const std::vector<string>& fetch,
                            RunMetadata* metadata) {
   // Initialize a virtual scheduler to process the graph. Make sure to use
-  // static shape inference to prevent the schedulrer from calling the Run
-  // method on the cluster, and create an infinite loop.
+  // static shape inference to prevent the scheduler from calling the Run
+  // method on the cluster and creating an infinite loop.
   GrapplerItem item;
   item.graph = graph;
   item.feed = feed;
   item.fetch = fetch;
-  VirtualScheduler scheduler(true, this, node_manager_.get());
+  // Note that we do not use aggressive shape inference to preserve unknown
+  // shapes from the input graph.
+  VirtualScheduler scheduler(/*use_static_shapes=*/true,
+                             /*use_aggressive_shape_inference=*/false, this,
+                             node_manager_.get());
   TF_RETURN_IF_ERROR(scheduler.Init(&item));
 
   if (metadata) {
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index 15dc7074b9c035bc31e3b1ed6132329b1c7768f5..92294379b5c058d7b17a851dc0fdc5039b86e288 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -171,6 +171,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
     ],
 )
 
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
index b7804ffaa5378c67028b39819a07fc00719c9896..09cddad8ba475062cdb7750b401a0f2f3672b118 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
@@ -104,19 +104,19 @@ AnalyticalCostEstimator::AnalyticalCostEstimator(Cluster* cluster,
                                                  bool use_static_shapes)
     : AnalyticalCostEstimator(
           cluster, absl::make_unique<OpLevelCostEstimator>(),
-          ReadyNodeManagerFactory("FirstReady"), use_static_shapes, nullptr) {}
+          ReadyNodeManagerFactory("FirstReady"), use_static_shapes) {}
 
 AnalyticalCostEstimator::AnalyticalCostEstimator(
     Cluster* cluster, std::unique_ptr<OpLevelCostEstimator> node_estimator,
-    std::unique_ptr<ReadyNodeManager> node_manager, bool use_static_shapes,
-    RunMetadata* run_metadata)
+    std::unique_ptr<ReadyNodeManager> node_manager, bool use_static_shapes)
     : cluster_(cluster),
       node_estimator_(std::move(node_estimator)),
       node_manager_(std::move(node_manager)),
-      use_static_shapes_(use_static_shapes),
-      run_metadata_(run_metadata) {
-  scheduler_ = absl::make_unique<VirtualScheduler>(use_static_shapes_, cluster_,
-                                                   node_manager_.get());
+      use_static_shapes_(use_static_shapes) {
+  // Use aggressive static shape inference to minimize unknown shapes.
+  scheduler_ = absl::make_unique<VirtualScheduler>(
+      use_static_shapes_,
+      /*use_aggressive_shape_inference=*/true, cluster_, node_manager_.get());
 }
 
 Status AnalyticalCostEstimator::Initialize(const GrapplerItem& item) {
@@ -128,6 +128,18 @@ Status AnalyticalCostEstimator::Initialize(const GrapplerItem& item) {
 Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
                                              CostGraphDef* cost_graph,
                                              Costs* costs) const {
+  RunMetadata run_metadata;
+  auto s =
+      PredictCostsAndReturnRunMetadata(optimized_graph, &run_metadata, costs);
+  if (s.ok() && cost_graph) {
+    cost_graph->Swap(run_metadata.mutable_cost_graph());
+  }
+  return s;
+}
+
+Status AnalyticalCostEstimator::PredictCostsAndReturnRunMetadata(
+    const GraphDef& optimized_graph, RunMetadata* run_metadata,
+    Costs* costs) const {
   GrapplerItem item = item_;
   item.graph = optimized_graph;
 
@@ -138,7 +150,9 @@ Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
   }
 
   gtl::FlatMap<string, CostGraphDef::Node*> name_to_cost_node;
-  if (cost_graph) {
+  CostGraphDef* cost_graph = nullptr;
+  if (run_metadata) {
+    cost_graph = run_metadata->mutable_cost_graph();
     // TODO(pcma): Clear nodes in cost_graph after we make sure we always pass
     // in an empty cost_graph (a non-empty but incomplete cost_graph will cause
     // problems, e.g., no node_id in cost_graph)
@@ -179,18 +193,13 @@ Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
     }
   }
 
-  *costs = scheduler_->Summary(run_metadata_);
-  // run_metadata_ gets step_stats and parition_graphs from Summary.
-  // Note that cost_graph could already point to the cost_graph field of
-  // run_metadata_, since both are set by the caller.
-  if (run_metadata_ && cost_graph &&
-      run_metadata_->mutable_cost_graph() != cost_graph)
-    *run_metadata_->mutable_cost_graph() = *cost_graph;
+  // run_metadata gets step_stats and partition_graphs from Summary.
+  *costs = scheduler_->Summary(run_metadata);
 
   if (VLOG_IS_ON(1)) {
     bool verbose = VLOG_IS_ON(2);
-    if (run_metadata_) {
-      VLOG(1) << GetStatsStringFromRunMetadata(*run_metadata_, verbose);
+    if (run_metadata) {
+      VLOG(1) << GetStatsStringFromRunMetadata(*run_metadata, verbose);
     } else {
       RunMetadata run_metadata;
       scheduler_->GenerateRunMetadata(&run_metadata);
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator.h b/tensorflow/core/grappler/costs/analytical_cost_estimator.h
index 2629672459c512a22a861bd5c0dfe0207afc38a0..6275c621988555826899f90c899b6cab227f2b73 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator.h
@@ -34,21 +34,16 @@ class Cluster;
 struct GrapplerItem;
 
 // Estimate the cost of running a Grappler item based on the theoretical
-// performance of the hardware that will run the model.
+// performance of the hardware that will run the model. Note that this
+// internally uses aggressive shape inference with static shape inference.
 class AnalyticalCostEstimator : public CostEstimator {
  public:
   // Does not take ownership of cluster.
   AnalyticalCostEstimator(Cluster* cluster, bool use_static_shapes);
-  // Does not take ownership of cluster or run_metadata
-  //
-  // When metadata is provided, step_stats and partition_graphs fields will
-  // always be filled during PredictCosts, and the cost_graph field of metadata
-  // will be filled only when cost_graph is not nullptr when invoking
-  // PredictCosts.
   AnalyticalCostEstimator(Cluster* cluster,
                           std::unique_ptr<OpLevelCostEstimator> node_estimator,
                           std::unique_ptr<ReadyNodeManager> node_manager,
-                          bool use_static_shapes, RunMetadata* run_metadata);
+                          bool use_static_shapes);
   ~AnalyticalCostEstimator() override {}
 
   // Initializes the estimator for the specified grappler item.
@@ -61,6 +56,10 @@ class AnalyticalCostEstimator : public CostEstimator {
   Status PredictCosts(const GraphDef& optimized_graph, CostGraphDef* cost_graph,
                       Costs* cost) const override;
 
+  Status PredictCostsAndReturnRunMetadata(const GraphDef& optimized_graph,
+                                          RunMetadata* run_metadata,
+                                          Costs* cost) const override;
+
   const VirtualScheduler* GetScheduler() const { return scheduler_.get(); }
 
  private:
@@ -70,8 +69,6 @@ class AnalyticalCostEstimator : public CostEstimator {
   std::unique_ptr<ReadyNodeManager> node_manager_;
   bool use_static_shapes_;
   std::unique_ptr<VirtualScheduler> scheduler_;
-
-  RunMetadata* run_metadata_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
index a9a1abfa989c9d8276b6ae263b95e7a71be41c8a..8a5634437827b207fef7ece85613d6ec145b5fb8 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
@@ -98,9 +98,10 @@ TEST_F(AnalyticalCostEstimatorTest, SimpleTest) {
   AnalyticalCostEstimator estimator(cluster_.get(), true);
   TF_ASSERT_OK(estimator.Initialize(item));
 
-  CostGraphDef cost_graph;
+  RunMetadata run_metadata;
   Costs summary;
-  TF_ASSERT_OK(estimator.PredictCosts(item.graph, &cost_graph, &summary));
+  TF_ASSERT_OK(estimator.PredictCostsAndReturnRunMetadata(
+      item.graph, &run_metadata, &summary));
 
   EXPECT_EQ(Costs::NanoSeconds(9151), summary.execution_time);
   // Note there are totally 17 nodes (RandomUniform creates 2 nodes), but
diff --git a/tensorflow/core/grappler/costs/cost_estimator.h b/tensorflow/core/grappler/costs/cost_estimator.h
index e3b3a36b096da807d05bee50d52a7a5c37884b52..725d81a88106835c57bf02d6ba9070aef19c8a91 100644
--- a/tensorflow/core/grappler/costs/cost_estimator.h
+++ b/tensorflow/core/grappler/costs/cost_estimator.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <cmath>
 #include <unordered_map>
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 class GraphDef;
@@ -223,6 +224,19 @@ class CostEstimator {
   // not.
   virtual Status PredictCosts(const GraphDef& optimized_graph,
                               CostGraphDef* cost_graph, Costs* cost) const = 0;
+
+  // TODO(dyoon): Delete PredictCosts() with CostGraphDef as RunMetadata is a
+  // superset of CostGraphDef.
+  // Same method, but returns RunMetadata.
+  virtual Status PredictCostsAndReturnRunMetadata(
+      const GraphDef& optimized_graph, RunMetadata* run_metadata,
+      Costs* cost) const {
+    CostGraphDef* cost_graph = nullptr;
+    if (run_metadata) {
+      cost_graph = run_metadata->mutable_cost_graph();
+    }
+    return PredictCosts(optimized_graph, cost_graph, cost);
+  }
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index d69997989643972b4dfe7159ecbd9fa0901c7381..e869f7830c2fceac634a7c3fd9796a95307211c2 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -425,9 +425,11 @@ NodeDef MakeConstNodeDefFromShape(InferenceContext* ic,
 // information is refined.
 class TopoQueue {
  public:
-  explicit TopoQueue(const std::unordered_map<const NodeDef*, int>& topo_order)
-      : topo_order_(topo_order) {}
+  explicit TopoQueue(const std::vector<const NodeDef*>& topo_order)
+      : topo_order_(TopoOrder(topo_order)) {}
+
   void push(const NodeDef* n) { queue_.emplace(n, topo_order_.at(n)); }
+
   const NodeDef* pop() {
     CHECK(!empty());
     auto it = queue_.begin();
@@ -448,7 +450,18 @@ class TopoQueue {
       return lhs.second < rhs.second;
     }
   };
-  const std::unordered_map<const NodeDef*, int>& topo_order_;
+
+  const std::unordered_map<const NodeDef*, int> TopoOrder(
+      const std::vector<const NodeDef*>& topo_order) const {
+    std::unordered_map<const NodeDef*, int> map;
+    map.reserve(topo_order.size());
+    for (int i = 0; i < topo_order.size(); ++i) {
+      map.emplace(topo_order[i], i);
+    }
+    return map;
+  }
+
+  const std::unordered_map<const NodeDef*, int> topo_order_;
   std::set<NodeAndId, OrderByIdAscending> queue_;
 };
 
@@ -656,7 +669,7 @@ class SymbolicShapeRefiner {
     ctx->output_tensor_protos.resize(grappler_function_item.output_size(),
                                      nullptr);
     for (auto const& out_arg : grappler_function_item.outputs()) {
-      if (out_arg.output_tensors.size() > 1) {
+      if (out_arg.output_nodes.size() > 1) {
         // TODO(jmdecker): Handle case of multiple output tensors
         return errors::Unimplemented(
             "Output arguments with multiple output tensors are not yet "
@@ -665,7 +678,7 @@ class SymbolicShapeRefiner {
 
       // It is guaranteed that output_tensors does not contain any control
       // inputs, so port_id >= 0.
-      TensorId out_tensor = ParseTensorName(out_arg.output_tensors[0]);
+      TensorId out_tensor = ParseTensorName(out_arg.output_nodes[0]);
 
       const NodeDef* retnode = gv.GetNode(out_tensor.node());
       if (retnode == nullptr) {
@@ -1074,15 +1087,20 @@ class SymbolicShapeRefiner {
         c->output_tensor_protos.size() < ic->num_outputs()) {
       return false;
     } else {
+      // Checks if we can get output value via either output_tensor_proto or
+      // output_tensors_as_shapes.
       for (int i = 0; i < ic->num_outputs(); i++) {
-        if (c->output_tensor_protos.size() <= i ||
-            c->output_tensor_protos[i] == nullptr) {
-          return false;
+        if (c->output_tensor_protos.size() > i &&
+            c->output_tensor_protos[i] != nullptr) {
+          continue;
         }
-        if (c->output_tensors_as_shapes.size() <= i ||
-            !ic->FullyDefined(c->output_tensors_as_shapes[i])) {
-          return false;
+        if (c->output_tensors_as_shapes.size() > i &&
+            ic->FullyDefined(c->output_tensors_as_shapes[i])) {
+          continue;
         }
+
+        // Unknown for output[i].
+        return false;
       }
     }
     return true;
@@ -1450,9 +1468,9 @@ class SymbolicShapeRefiner {
       // Due to the cost of EvaluateNode(), we run it only for certain op types
       // (white listed) and small integer tensors.
 
-      const int max_elelment_size = 17;  // Max up to 4x4 matrix or similar.
+      const int max_element_size = 17;  // Max up to 4x4 matrix or similar.
       if (AllOutputValuesKnown(c) || !AllInputValuesKnown(c) ||
-          !ShouldUpdateOutputValues(c, max_elelment_size)) {
+          !ShouldUpdateOutputValues(c, max_element_size)) {
         return Status::OK();
       }
       UpdateOutputValues(node, c).IgnoreError();  // This is optional.
@@ -1970,7 +1988,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds,
   }
 
   std::unordered_map<const NodeDef*, const NodeDef*> resource_handles;
-  std::vector<std::pair<const NodeDef*, const NodeDef*>> extra_deps;
+  std::vector<TopologicalDependency> extra_deps;
   for (const auto& resource : resources) {
     for (const NodeDef* src : resource.second.first) {
       resource_handles[src] = resource.first;
@@ -1982,8 +2000,8 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds,
     }
   }
 
-  std::unordered_map<const NodeDef*, int> topo_order;
-  Status s = ComputeTopologicalOrder(item_.graph, &topo_order, &extra_deps);
+  std::vector<const NodeDef*> topo_order;
+  Status s = ComputeTopologicalOrder(item_.graph, extra_deps, &topo_order);
   if (!s.ok()) {
     if (extra_deps.empty()) {
       return s;
@@ -1992,8 +2010,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds,
       // order. This will make the shape inference less precise but since this
       // isn't common it's not worth to figure out where to break the loop and
       // do a proper relaxation.
-      TF_RETURN_IF_ERROR(
-          ComputeTopologicalOrder(item_.graph, &topo_order, nullptr));
+      TF_RETURN_IF_ERROR(ComputeTopologicalOrder(item_.graph, &topo_order));
     }
   }
 
diff --git a/tensorflow/core/grappler/costs/graph_properties_testdata/large_function_graph.pbtxt b/tensorflow/core/grappler/costs/graph_properties_testdata/large_function_graph.pbtxt
index 415c347a1d2d563099490b780e10008508259027..d4e23e901a46a8524c2b2ef7d2311b9cf48850e7 100644
--- a/tensorflow/core/grappler/costs/graph_properties_testdata/large_function_graph.pbtxt
+++ b/tensorflow/core/grappler/costs/graph_properties_testdata/large_function_graph.pbtxt
@@ -511,6 +511,13 @@ library {
           s: "VALID"
         }
       }
+      attr {
+        key: "explicit_paddings"
+        value {
+          list {
+          }
+        }
+      }
       attr {
         key: "strides"
         value {
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 55eb391d2b344778df7d23528dbe42596321b95f..96bac8d0cb3feff65680edd1b96b46d84a838031 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -72,25 +72,25 @@ static const Costs::Duration kMinComputeTime(1);
 
 namespace {
 
-string GetDataFormat(const OpInfo& op_features) {
+string GetDataFormat(const OpInfo& op_info) {
   string data_format = "NHWC";  // Default format.
-  if (op_features.attr().find("data_format") != op_features.attr().end()) {
-    data_format = op_features.attr().at("data_format").s();
+  if (op_info.attr().find("data_format") != op_info.attr().end()) {
+    data_format = op_info.attr().at("data_format").s();
   }
   return data_format;
 }
 
-string GetFilterFormat(const OpInfo& op_features) {
+string GetFilterFormat(const OpInfo& op_info) {
   string filter_format = "HWIO";  // Default format.
-  if (op_features.attr().find("filter_format") != op_features.attr().end()) {
-    filter_format = op_features.attr().at("filter_format").s();
+  if (op_info.attr().find("filter_format") != op_info.attr().end()) {
+    filter_format = op_info.attr().at("filter_format").s();
   }
   return filter_format;
 }
 
-Padding GetPadding(const OpInfo& op_features) {
-  if (op_features.attr().find("padding") != op_features.attr().end() &&
-      op_features.attr().at("padding").s() == "VALID") {
+Padding GetPadding(const OpInfo& op_info) {
+  if (op_info.attr().find("padding") != op_info.attr().end() &&
+      op_info.attr().at("padding").s() == "VALID") {
     return Padding::VALID;
   }
   return Padding::SAME;  // Default padding.
@@ -107,11 +107,11 @@ bool IsTraining(const OpInfo& op_info) {
 // TODO(dyoon): support non-4D tensors in the c ost functions of convolution
 // related ops (Conv, Pool, BatchNorm, and their backprops) and the related
 // helper functions.
-std::vector<int64> GetStrides(const OpInfo& op_features) {
-  if (op_features.attr().find("strides") != op_features.attr().end()) {
-    const auto strides = op_features.attr().at("strides").list().i();
-    CHECK(strides.size() == 4) << "Attr strides is not a length-4 vector: "
-                               << op_features.DebugString();
+std::vector<int64> GetStrides(const OpInfo& op_info) {
+  if (op_info.attr().find("strides") != op_info.attr().end()) {
+    const auto strides = op_info.attr().at("strides").list().i();
+    CHECK(strides.size() == 4)
+        << "Attr strides is not a length-4 vector: " << op_info.DebugString();
     return {strides[0], strides[1], strides[2], strides[3]};
   }
   return {1, 1, 1, 1};
@@ -359,21 +359,21 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
 }
 
 Costs OpLevelCostEstimator::PredictCosts(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
-  auto it = device_cost_impl_.find(op_features.op());
+  const auto& op_info = op_context.op_info;
+  auto it = device_cost_impl_.find(op_info.op());
   if (it == device_cost_impl_.end()) {
-    if (elementwise_ops_.find(op_features.op()) != elementwise_ops_.end()) {
+    if (elementwise_ops_.find(op_info.op()) != elementwise_ops_.end()) {
       return PredictCwiseOp(op_context);
     }
 
-    VLOG(1) << "Missing accurate estimator for op: " << op_features.op();
+    VLOG(1) << "Missing accurate estimator for op: " << op_info.op();
 
     return PredictCostOfAnUnknownOp(op_context);
   }
 
   std::function<Costs(const OpContext&)> estimator = it->second;
   Costs costs = estimator(op_context);
-  VLOG(1) << "Operation " << op_features.op() << " takes "
+  VLOG(1) << "Operation " << op_info.op() << " takes "
           << costs.execution_time.count() << " ns.";
   return costs;
 }
@@ -430,39 +430,38 @@ DeviceInfo OpLevelCostEstimator::GetDeviceInfo(
 }
 
 Costs OpLevelCostEstimator::PredictCwiseOp(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   bool found_unknown_shapes = false;
   // For unary or binary element-wise operations, op count is the element count
   // of any input. We use the count for the largest input here to be more robust
   // in case that the shape is unknown or partially known for other input.
-  int64 op_count =
-      CalculateLargestInputCount(op_features, &found_unknown_shapes);
+  int64 op_count = CalculateLargestInputCount(op_info, &found_unknown_shapes);
   // If output shape is available, try use the element count calcuated from
   // that.
-  if (op_features.outputs_size() > 0) {
-    op_count =
-        std::max(op_count, CalculateTensorElementCount(op_features.outputs(0),
-                                                       &found_unknown_shapes));
+  if (op_info.outputs_size() > 0) {
+    op_count = std::max(
+        op_count,
+        CalculateTensorElementCount(op_info.outputs(0), &found_unknown_shapes));
   }
   // For binary ops, calculate the output shape possibly resulting from
   // broadcasting.
-  if (op_features.inputs_size() >= 2) {
-    op_count = std::max(op_count,
-                        CwiseOutputElementCount(op_features.inputs(0).shape(),
-                                                op_features.inputs(1).shape()));
+  if (op_info.inputs_size() >= 2) {
+    op_count =
+        std::max(op_count, CwiseOutputElementCount(op_info.inputs(0).shape(),
+                                                   op_info.inputs(1).shape()));
   }
 
   int op_cost = 1;
   bool is_known_elementwise_op = false;
-  auto it = elementwise_ops_.find(op_features.op());
+  auto it = elementwise_ops_.find(op_info.op());
   if (it != elementwise_ops_.end()) {
     op_cost = it->second;
     is_known_elementwise_op = true;
   } else {
-    LOG(WARNING) << "Not a cwise op: " << op_features.op();
+    LOG(WARNING) << "Not a cwise op: " << op_info.op();
   }
 
-  Costs costs = PredictOpCountBasedCost(op_count * op_cost, op_features);
+  Costs costs = PredictOpCountBasedCost(op_count * op_cost, op_info);
   if (found_unknown_shapes || !is_known_elementwise_op) {
     costs.inaccurate = true;
   }
@@ -542,17 +541,17 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost(
 }
 
 int64 OpLevelCostEstimator::CountConv2DOperations(
-    const OpInfo& op_features, bool* found_unknown_shapes) const {
-  return CountConv2DOperations(op_features, nullptr, found_unknown_shapes);
+    const OpInfo& op_info, bool* found_unknown_shapes) const {
+  return CountConv2DOperations(op_info, nullptr, found_unknown_shapes);
 }
 
 // Helper to translate the positional arguments into named fields.
 OpLevelCostEstimator::ConvolutionDimensions
 OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
     const TensorShapeProto& original_image_shape,
-    const TensorShapeProto& original_filter_shape, const OpInfo& op_features,
+    const TensorShapeProto& original_filter_shape, const OpInfo& op_info,
     bool* found_unknown_shapes) {
-  VLOG(2) << "op features: " << op_features.DebugString();
+  VLOG(2) << "op features: " << op_info.DebugString();
   VLOG(2) << "Original image shape: " << original_image_shape.DebugString();
   VLOG(2) << "Original filter shape: " << original_filter_shape.DebugString();
   auto image_shape =
@@ -563,7 +562,7 @@ OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
   VLOG(2) << "Filter shape: " << filter_shape.DebugString();
 
   int x_index, y_index, channel_index;
-  const string& data_format = GetDataFormat(op_features);
+  const string& data_format = GetDataFormat(op_info);
   if (data_format == "NCHW") {
     x_index = 2;
     y_index = 3;
@@ -574,7 +573,7 @@ OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
     y_index = 2;
     channel_index = 3;
   }
-  const string& filter_format = GetFilterFormat(op_features);
+  const string& filter_format = GetFilterFormat(op_info);
   int filter_x_index, filter_y_index, in_channel_index, out_channel_index;
   if (filter_format == "HWIO") {
     filter_x_index = 0;
@@ -594,8 +593,8 @@ OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
   int64 iz = image_shape.dim(channel_index).size();
   int64 kx = filter_shape.dim(filter_x_index).size();
   int64 ky = filter_shape.dim(filter_y_index).size();
-  std::vector<int64> strides = GetStrides(op_features);
-  const auto padding = GetPadding(op_features);
+  std::vector<int64> strides = GetStrides(op_info);
+  const auto padding = GetPadding(op_info);
   int64 sx = strides[x_index];
   int64 sy = strides[y_index];
   int64 ox = GetOutputSize(ix, kx, sx, padding);
@@ -623,14 +622,13 @@ OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
 }
 
 int64 OpLevelCostEstimator::CountConv2DOperations(
-    const OpInfo& op_features, ConvolutionDimensions* conv_info,
+    const OpInfo& op_info, ConvolutionDimensions* conv_info,
     bool* found_unknown_shapes) const {
-  DCHECK(op_features.op() == kConv2d ||
-         op_features.op() == kDepthwiseConv2dNative)
+  DCHECK(op_info.op() == kConv2d || op_info.op() == kDepthwiseConv2dNative)
       << "Invalid Operation: not Conv2D nor DepthwiseConv2dNative";
 
   ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
-      op_features.inputs(0).shape(), op_features.inputs(1).shape(), op_features,
+      op_info.inputs(0).shape(), op_info.inputs(1).shape(), op_info,
       found_unknown_shapes);
 
   //  in DepthwiseConv2dNative conv_dims.oz is actually the channel depth
@@ -641,7 +639,7 @@ int64 OpLevelCostEstimator::CountConv2DOperations(
   int64 ops = conv_dims.batch;
   ops *= conv_dims.ox * conv_dims.oy;
   ops *= conv_dims.kx * conv_dims.ky;
-  if (op_features.op() == kConv2d) {
+  if (op_info.op() == kConv2d) {
     ops *= conv_dims.iz * conv_dims.oz;
   } else {
     // To ensure output tensor dims to be correct for DepthwiseConv2DNative,
@@ -658,32 +656,32 @@ int64 OpLevelCostEstimator::CountConv2DOperations(
 }
 
 int64 OpLevelCostEstimator::CountMatMulOperations(
-    const OpInfo& op_features, bool* found_unknown_shapes) const {
-  return CountMatMulOperations(op_features, nullptr, found_unknown_shapes);
+    const OpInfo& op_info, bool* found_unknown_shapes) const {
+  return CountMatMulOperations(op_info, nullptr, found_unknown_shapes);
 }
 
 // TODO(nishantpatil): Create separate estimator for Sparse Matmul
 int64 OpLevelCostEstimator::CountMatMulOperations(
-    const OpInfo& op_features, MatMulDimensions* mat_mul,
+    const OpInfo& op_info, MatMulDimensions* mat_mul,
     bool* found_unknown_shapes) const {
   double ops = 0;
 
-  if (op_features.inputs_size() < 2) {
-    LOG(ERROR) << "Need 2 inputs but got " << op_features.inputs_size();
+  if (op_info.inputs_size() < 2) {
+    LOG(ERROR) << "Need 2 inputs but got " << op_info.inputs_size();
     // TODO(pcma): Try to separate invalid inputs from unknown shapes
     *found_unknown_shapes = true;
     return 0;
   }
 
-  auto& a_matrix = op_features.inputs(0);
-  auto& b_matrix = op_features.inputs(1);
+  auto& a_matrix = op_info.inputs(0);
+  auto& b_matrix = op_info.inputs(1);
 
   bool transpose_a = false;
   bool transpose_b = false;
 
   double m_dim, n_dim, k_dim, k_dim_b = 0;
 
-  for (const auto& item : op_features.attr()) {
+  for (const auto& item : op_info.attr()) {
     VLOG(1) << "Key:" << item.first
             << " Value:" << SummarizeAttrValue(item.second);
     if (item.first == "transpose_a" && item.second.b() == true)
@@ -735,23 +733,23 @@ int64 OpLevelCostEstimator::CountMatMulOperations(
 }
 
 int64 OpLevelCostEstimator::CountBatchMatMulOperations(
-    const OpInfo& op_features, bool* found_unknown_shapes) const {
-  if (op_features.op() != kBatchMatMul) {
-    LOG(ERROR) << "Invalid Operation: " << op_features.op();
+    const OpInfo& op_info, bool* found_unknown_shapes) const {
+  if (op_info.op() != kBatchMatMul) {
+    LOG(ERROR) << "Invalid Operation: " << op_info.op();
     // TODO(pcma): Try to separate invalid inputs from unknown shapes
     *found_unknown_shapes = true;
     return 0;
   }
-  if (op_features.inputs_size() != 2) {
-    LOG(ERROR) << "Expected 2 inputs but got " << op_features.inputs_size();
+  if (op_info.inputs_size() != 2) {
+    LOG(ERROR) << "Expected 2 inputs but got " << op_info.inputs_size();
     // TODO(pcma): Try to separate invalid inputs from unknown shapes
     *found_unknown_shapes = true;
     return 0;
   }
 
   double ops = 0;
-  const auto& a_input = op_features.inputs(0);
-  const auto& b_input = op_features.inputs(1);
+  const auto& a_input = op_info.inputs(0);
+  const auto& b_input = op_info.inputs(1);
 
   // BatchMatMul requires inputs of at least matrix shape (rank 2).
   // The two most minor dimensions of each input are matrices that
@@ -801,24 +799,24 @@ int64 OpLevelCostEstimator::CountBatchMatMulOperations(
 
   // Build the MatMul. Note that values are ignored here since we are just
   // counting ops (e.g. only shapes matter).
-  OpInfo matmul_op_features;
-  matmul_op_features.set_op("MatMul");
+  OpInfo matmul_op_info;
+  matmul_op_info.set_op("MatMul");
 
   AttrValue transpose_a;
   transpose_a.set_b(false);
-  if (op_features.attr().find("adj_x") != op_features.attr().end()) {
-    transpose_a.set_b(op_features.attr().at("adj_x").b());
+  if (op_info.attr().find("adj_x") != op_info.attr().end()) {
+    transpose_a.set_b(op_info.attr().at("adj_x").b());
   }
-  (*matmul_op_features.mutable_attr())["transpose_a"] = transpose_a;
+  (*matmul_op_info.mutable_attr())["transpose_a"] = transpose_a;
 
   AttrValue transpose_b;
   transpose_b.set_b(false);
-  if (op_features.attr().find("adj_y") != op_features.attr().end()) {
-    transpose_b.set_b(op_features.attr().at("adj_y").b());
+  if (op_info.attr().find("adj_y") != op_info.attr().end()) {
+    transpose_b.set_b(op_info.attr().at("adj_y").b());
   }
-  (*matmul_op_features.mutable_attr())["transpose_b"] = transpose_b;
+  (*matmul_op_info.mutable_attr())["transpose_b"] = transpose_b;
 
-  OpInfo::TensorProperties* a_matrix = matmul_op_features.add_inputs();
+  OpInfo::TensorProperties* a_matrix = matmul_op_info.add_inputs();
   a_matrix->set_dtype(a_input.dtype());
   TensorShapeProto* a_matrix_shape = a_matrix->mutable_shape();
   for (int i = std::max(0, a_input_shape.dim_size() - matrix_rank);
@@ -826,7 +824,7 @@ int64 OpLevelCostEstimator::CountBatchMatMulOperations(
     *(a_matrix_shape->add_dim()) = a_input_shape.dim(i);
   }
 
-  OpInfo::TensorProperties* b_matrix = matmul_op_features.add_inputs();
+  OpInfo::TensorProperties* b_matrix = matmul_op_info.add_inputs();
   b_matrix->set_dtype(b_input.dtype());
   TensorShapeProto* b_matrix_shape = b_matrix->mutable_shape();
   for (int i = std::max(0, b_input_shape.dim_size() - matrix_rank);
@@ -836,7 +834,7 @@ int64 OpLevelCostEstimator::CountBatchMatMulOperations(
 
   for (int i = 0; i < num_matmuls; ++i) {
     bool matmul_unknown_shapes = false;
-    ops += CountMatMulOperations(matmul_op_features, &matmul_unknown_shapes);
+    ops += CountMatMulOperations(matmul_op_info, &matmul_unknown_shapes);
     *found_unknown_shapes |= matmul_unknown_shapes;
   }
   return ops;
@@ -894,16 +892,16 @@ bool GetTensorShapeProtoFromTensorProto(const TensorProto& tensor_proto,
 
 // TODO(cliffy): Dedup this method and CountConv2DBackpropFilterOperations.
 int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
-    const OpInfo& op_features, ConvolutionDimensions* returned_conv_dims,
+    const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims,
     bool* found_unknown_shapes) const {
   int64 ops = 0;
 
-  DCHECK(op_features.op() == kConv2dBackpropInput ||
-         op_features.op() == kDepthwiseConv2dNativeBackpropInput)
+  DCHECK(op_info.op() == kConv2dBackpropInput ||
+         op_info.op() == kDepthwiseConv2dNativeBackpropInput)
       << "Invalid Operation: not kConv2dBackpropInput nor"
          "kDepthwiseConv2dNativeBackpropInput";
 
-  if (op_features.inputs_size() < 2) {
+  if (op_info.inputs_size() < 2) {
     // TODO(pcma): Try to separate invalid inputs from unknown shapes
     *found_unknown_shapes = true;
     return ops;
@@ -911,12 +909,12 @@ int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
 
   TensorShapeProto input_shape;
   bool shape_found = false;
-  if (op_features.inputs(0).has_value()) {
-    const TensorProto& value = op_features.inputs(0).value();
+  if (op_info.inputs(0).has_value()) {
+    const TensorProto& value = op_info.inputs(0).value();
     shape_found = GetTensorShapeProtoFromTensorProto(value, &input_shape);
   }
-  if (!shape_found && op_features.outputs_size() == 1) {
-    input_shape = op_features.outputs(0).shape();
+  if (!shape_found && op_info.outputs_size() == 1) {
+    input_shape = op_info.outputs(0).shape();
     shape_found = true;
   }
   if (!shape_found) {
@@ -929,13 +927,12 @@ int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
   }
 
   ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
-      input_shape, op_features.inputs(1).shape(), op_features,
-      found_unknown_shapes);
+      input_shape, op_info.inputs(1).shape(), op_info, found_unknown_shapes);
 
   ops = conv_dims.batch;
   ops *= conv_dims.ox * conv_dims.oy;
   ops *= conv_dims.kx * conv_dims.ky;
-  if (op_features.op() == kConv2dBackpropInput) {
+  if (op_info.op() == kConv2dBackpropInput) {
     ops *= conv_dims.iz * conv_dims.oz;
   } else {
     // conv_dims always use forward path definition regardless
@@ -944,7 +941,7 @@ int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
   }
   ops *= kOpsPerMac;
 
-  VLOG(1) << "Operations for" << op_features.op() << "  " << ops;
+  VLOG(1) << "Operations for" << op_info.op() << "  " << ops;
 
   if (returned_conv_dims != nullptr) {
     *returned_conv_dims = conv_dims;
@@ -953,23 +950,23 @@ int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
 }
 
 int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
-    const OpInfo& op_features, ConvolutionDimensions* returned_conv_dims,
+    const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims,
     bool* found_unknown_shapes) const {
   int64 ops = 0;
 
-  DCHECK(op_features.op() == kConv2dBackpropFilter ||
-         op_features.op() == kDepthwiseConv2dNativeBackpropFilter)
+  DCHECK(op_info.op() == kConv2dBackpropFilter ||
+         op_info.op() == kDepthwiseConv2dNativeBackpropFilter)
       << "Invalid Operation: not kConv2dBackpropFilter nor"
          "kDepthwiseConv2dNativeBackpropFilter";
 
   TensorShapeProto filter_shape;
   bool shape_found = false;
-  if (op_features.inputs_size() >= 2 && op_features.inputs(1).has_value()) {
-    const TensorProto& value = op_features.inputs(1).value();
+  if (op_info.inputs_size() >= 2 && op_info.inputs(1).has_value()) {
+    const TensorProto& value = op_info.inputs(1).value();
     shape_found = GetTensorShapeProtoFromTensorProto(value, &filter_shape);
   }
-  if (!shape_found && op_features.outputs_size() == 1) {
-    filter_shape = op_features.outputs(0).shape();
+  if (!shape_found && op_info.outputs_size() == 1) {
+    filter_shape = op_info.outputs(0).shape();
     shape_found = true;
   }
   if (!shape_found) {
@@ -981,19 +978,18 @@ int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
     *found_unknown_shapes = true;
   }
 
-  if (op_features.inputs_size() < 1) {
+  if (op_info.inputs_size() < 1) {
     // TODO(pcma): Try to separate invalid inputs from unknown shapes
     *found_unknown_shapes = true;
     return ops;
   }
   ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
-      op_features.inputs(0).shape(), filter_shape, op_features,
-      found_unknown_shapes);
+      op_info.inputs(0).shape(), filter_shape, op_info, found_unknown_shapes);
 
   ops = conv_dims.batch;
   ops *= conv_dims.ox * conv_dims.oy;
   ops *= conv_dims.kx * conv_dims.ky;
-  if (op_features.op() == kConv2dBackpropFilter) {
+  if (op_info.op() == kConv2dBackpropFilter) {
     ops *= conv_dims.iz * conv_dims.oz;
   } else {
     // conv_dims always use forward path definition regardless
@@ -1001,7 +997,7 @@ int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
     ops *= conv_dims.oz;
   }
   ops *= kOpsPerMac;
-  VLOG(1) << "Operations for" << op_features.op() << "  " << ops;
+  VLOG(1) << "Operations for" << op_info.op() << "  " << ops;
 
   if (returned_conv_dims != nullptr) {
     *returned_conv_dims = conv_dims;
@@ -1032,9 +1028,9 @@ int64 OpLevelCostEstimator::CalculateTensorSize(
 }
 
 int64 OpLevelCostEstimator::CalculateInputSize(
-    const OpInfo& op_features, bool* found_unknown_shapes) const {
+    const OpInfo& op_info, bool* found_unknown_shapes) const {
   int64 total_input_size = 0;
-  for (auto& input : op_features.inputs()) {
+  for (auto& input : op_info.inputs()) {
     int64 input_size = CalculateTensorSize(input, found_unknown_shapes);
     total_input_size += input_size;
     VLOG(1) << "Input Size: " << input_size
@@ -1044,9 +1040,9 @@ int64 OpLevelCostEstimator::CalculateInputSize(
 }
 
 int64 OpLevelCostEstimator::CalculateLargestInputCount(
-    const OpInfo& op_features, bool* found_unknown_shapes) const {
+    const OpInfo& op_info, bool* found_unknown_shapes) const {
   int64 largest_input_count = 0;
-  for (auto& input : op_features.inputs()) {
+  for (auto& input : op_info.inputs()) {
     int64 input_count =
         CalculateTensorElementCount(input, found_unknown_shapes);
     if (input_count > largest_input_count) {
@@ -1059,10 +1055,10 @@ int64 OpLevelCostEstimator::CalculateLargestInputCount(
 }
 
 int64 OpLevelCostEstimator::CalculateOutputSize(
-    const OpInfo& op_features, bool* found_unknown_shapes) const {
+    const OpInfo& op_info, bool* found_unknown_shapes) const {
   int64 total_output_size = 0;
   // use float as default for calculations
-  for (const auto& output : op_features.outputs()) {
+  for (const auto& output : op_info.outputs()) {
     DataType dt = output.dtype();
     const auto& original_output_shape = output.shape();
     int64 output_size = DataTypeSize(BaseType(dt));
@@ -1080,10 +1076,10 @@ int64 OpLevelCostEstimator::CalculateOutputSize(
 }
 
 Costs OpLevelCostEstimator::PredictConv2D(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   bool found_unknown_shapes = false;
   auto costs = PredictOpCountBasedCost(
-      CountConv2DOperations(op_features, &found_unknown_shapes), op_features);
+      CountConv2DOperations(op_info, &found_unknown_shapes), op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
@@ -1091,12 +1087,12 @@ Costs OpLevelCostEstimator::PredictConv2D(const OpContext& op_context) const {
 
 Costs OpLevelCostEstimator::PredictConv2DBackpropInput(
     const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   bool found_unknown_shapes = false;
   auto costs =
       PredictOpCountBasedCost(CountConv2DBackpropInputOperations(
-                                  op_features, nullptr, &found_unknown_shapes),
-                              op_features);
+                                  op_info, nullptr, &found_unknown_shapes),
+                              op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
@@ -1104,12 +1100,12 @@ Costs OpLevelCostEstimator::PredictConv2DBackpropInput(
 
 Costs OpLevelCostEstimator::PredictConv2DBackpropFilter(
     const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   bool found_unknown_shapes = false;
   auto costs =
       PredictOpCountBasedCost(CountConv2DBackpropFilterOperations(
-                                  op_features, nullptr, &found_unknown_shapes),
-                              op_features);
+                                  op_info, nullptr, &found_unknown_shapes),
+                              op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
@@ -1204,26 +1200,26 @@ Costs OpLevelCostEstimator::PredictFusedConv2DBiasActivation(
 }
 
 Costs OpLevelCostEstimator::PredictMatMul(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   bool found_unknown_shapes = false;
   auto costs = PredictOpCountBasedCost(
-      CountMatMulOperations(op_features, &found_unknown_shapes), op_features);
+      CountMatMulOperations(op_info, &found_unknown_shapes), op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
 }
 
 Costs OpLevelCostEstimator::PredictNoOp(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
-  VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)";
+  const auto& op_info = op_context.op_info;
+  VLOG(1) << "Op:" << op_info.op() << " Execution Time 0 (ns)";
   return Costs::ZeroCosts();
 }
 
 Costs OpLevelCostEstimator::PredictIdentity(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
-  VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)";
+  const auto& op_info = op_context.op_info;
+  VLOG(1) << "Op:" << op_info.op() << " Execution Time 0 (ns)";
   Costs result = Costs::ZeroCosts();
-  result.max_memory = CalculateOutputSize(op_features, &result.inaccurate);
+  result.max_memory = CalculateOutputSize(op_info, &result.inaccurate);
   result.num_ops_with_unknown_shapes = result.inaccurate;
   // Assign the minimum amount of time we can represent to the identity op since
   // it tends to be really cheap.
@@ -1233,11 +1229,10 @@ Costs OpLevelCostEstimator::PredictIdentity(const OpContext& op_context) const {
 }
 
 Costs OpLevelCostEstimator::PredictVariable(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
-  VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)";
+  const auto& op_info = op_context.op_info;
+  VLOG(1) << "Op:" << op_info.op() << " Execution Time 0 (ns)";
   Costs result = Costs::ZeroCosts();
-  result.persistent_memory =
-      CalculateOutputSize(op_features, &result.inaccurate);
+  result.persistent_memory = CalculateOutputSize(op_info, &result.inaccurate);
   result.num_ops_with_unknown_shapes = result.inaccurate;
 
   result.compute_time = kMinComputeTime;
@@ -1247,20 +1242,19 @@ Costs OpLevelCostEstimator::PredictVariable(const OpContext& op_context) const {
 
 Costs OpLevelCostEstimator::PredictBatchMatMul(
     const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   bool found_unknown_shapes = false;
   Costs costs = PredictOpCountBasedCost(
-      CountBatchMatMulOperations(op_features, &found_unknown_shapes),
-      op_features);
+      CountBatchMatMulOperations(op_info, &found_unknown_shapes), op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
 }
 
 Costs OpLevelCostEstimator::PredictMetadata(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   Costs costs = Costs::ZeroCosts();
-  costs.max_memory = CalculateOutputSize(op_features, &costs.inaccurate);
+  costs.max_memory = CalculateOutputSize(op_info, &costs.inaccurate);
   costs.num_ops_with_unknown_shapes = costs.inaccurate;
   // Metadata operations are so cheap we assume they take the minimum amount of
   // time we can represent (1 ns).
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index 84dd9213f773b538db71f0999c7ffd0b34e1881c..f8ba8c6637d9aade6610a6af8dd6c9f3e0be01af 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -16,10 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_
 #define TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_
 
-#include <functional>
-#include <map>
-#include <string>
-
 #include "tensorflow/core/grappler/costs/cost_estimator.h"
 #include "tensorflow/core/grappler/costs/op_context.h"
 #include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
@@ -79,24 +75,23 @@ class OpLevelCostEstimator {
     int64 sy;         // Stride y.
     Padding padding;  // SAME or VALID.
   };
-  int64 CountConv2DOperations(const OpInfo& op_features,
+  int64 CountConv2DOperations(const OpInfo& op_info,
                               bool* found_unknown_shapes) const;
-  int64 CountConv2DOperations(const OpInfo& op_features,
+  int64 CountConv2DOperations(const OpInfo& op_info,
                               ConvolutionDimensions* conv_info,
                               bool* found_unknown_shapes) const;
-  int64 CountMatMulOperations(const OpInfo& op_features,
+  int64 CountMatMulOperations(const OpInfo& op_info,
                               bool* found_unknown_shapes) const;
-  int64 CountMatMulOperations(const OpInfo& op_features,
-                              MatMulDimensions* mat_mul,
+  int64 CountMatMulOperations(const OpInfo& op_info, MatMulDimensions* mat_mul,
                               bool* found_unknown_shapes) const;
-  int64 CountBatchMatMulOperations(const OpInfo& op_features,
+  int64 CountBatchMatMulOperations(const OpInfo& op_info,
                                    bool* found_unknown_shapes) const;
-  int64 CountConv2DBackpropInputOperations(const OpInfo& op_features,
-                                           ConvolutionDimensions* conv_info,
-                                           bool* found_unknown_shapes) const;
-  int64 CountConv2DBackpropFilterOperations(const OpInfo& op_features,
-                                            ConvolutionDimensions* conv_info,
-                                            bool* found_unknown_shapes) const;
+  int64 CountConv2DBackpropInputOperations(
+      const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims,
+      bool* found_unknown_shapes) const;
+  int64 CountConv2DBackpropFilterOperations(
+      const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims,
+      bool* found_unknown_shapes) const;
 
   // Calculate the element count of an input/output tensor.
   int64 CalculateTensorElementCount(const OpInfo::TensorProperties& tensor,
@@ -108,17 +103,17 @@ class OpLevelCostEstimator {
 
   // Calculate the element count of the largest
   // input of specified TensorFlow op.
-  int64 CalculateLargestInputCount(const OpInfo& op_features,
+  int64 CalculateLargestInputCount(const OpInfo& op_info,
                                    bool* found_unknown_shapes) const;
 
   // Calculate the total size in bytes of the all
   // the inputs of specified TensorFlow op.
-  int64 CalculateInputSize(const OpInfo& op_features,
+  int64 CalculateInputSize(const OpInfo& op_info,
                            bool* found_unknown_shapes) const;
 
   // Calculate the total size in bytes of the all
   // the outputs of specified TensorFlow op.
-  int64 CalculateOutputSize(const OpInfo& op_features,
+  int64 CalculateOutputSize(const OpInfo& op_info,
                             bool* found_unknown_shapes) const;
 
   // This family of routines predicts the costs to
@@ -205,4 +200,5 @@ class OpLevelCostEstimator {
 
 }  // end namespace grappler
 }  // end namespace tensorflow
+
 #endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index 9a59877ac51c850ec59caad61db9d999cb0e17bb..6a9bf13b93b775eb44df5a8c117564a9d82648c1 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -29,8 +29,8 @@ namespace grappler {
 
 namespace {
 // Wrangles the minimum number of proto fields to set up a matrix.
-void DescribeMatrix(int rows, int columns, OpInfo* op_features) {
-  auto input = op_features->add_inputs();
+void DescribeMatrix(int rows, int columns, OpInfo* op_info) {
+  auto input = op_info->add_inputs();
   auto shape = input->mutable_shape();
   auto shape_rows = shape->add_dim();
   shape_rows->set_size(rows);
@@ -39,8 +39,8 @@ void DescribeMatrix(int rows, int columns, OpInfo* op_features) {
   input->set_dtype(DT_FLOAT);
 }
 
-void SetCpuDevice(OpInfo* op_features) {
-  auto device = op_features->mutable_device();
+void SetCpuDevice(OpInfo* op_info) {
+  auto device = op_info->mutable_device();
   device->set_type("CPU");
   device->set_num_cores(10);
   device->set_bandwidth(10000000);  // 10000000 KB/s = 10 GB/s
@@ -413,15 +413,14 @@ class OpLevelCostEstimatorTest : public ::testing::Test {
     return estimator_.PredictCosts(op_context);
   }
 
-  int64 CountMatMulOperations(const OpInfo& op_features,
+  int64 CountMatMulOperations(const OpInfo& op_info,
                               bool* found_unknown_shapes) const {
-    return estimator_.CountMatMulOperations(op_features, found_unknown_shapes);
+    return estimator_.CountMatMulOperations(op_info, found_unknown_shapes);
   }
 
-  int64 CountBatchMatMulOperations(const OpInfo& op_features,
+  int64 CountBatchMatMulOperations(const OpInfo& op_info,
                                    bool* found_unknown_shapes) const {
-    return estimator_.CountBatchMatMulOperations(op_features,
-                                                 found_unknown_shapes);
+    return estimator_.CountBatchMatMulOperations(op_info, found_unknown_shapes);
   }
 
   void SetComputeMemoryOverlap(bool value) {
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index ae5200b359232153f96c9ffa21a505d2a056d55d..0aac0348b512d2e8040a9ac1337ceb9c12a09206 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -15,8 +15,6 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/costs/virtual_scheduler.h"
 
-#include <math.h>
-
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -38,6 +36,12 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
+// Optional attribute name for Switch op as a vector of int that tells
+// which branch the Switch output is taken on every round of execution.
+// We use this side information, if provided, for scheduling ops after Switch
+// correctly (e.g., While loop).
+constexpr char kOutputSlots[] = "_output_slot_vector";
+
 Costs CombineCosts(const Costs& left, const Costs& right) {
   CHECK_NE(left.max_memory, kMemoryUnknown);
   CHECK_NE(left.max_per_op_buffers, kMemoryUnknown);
@@ -306,43 +310,25 @@ ReadyNodeManager* VirtualScheduler::ReadyNodeManagerFactory(
   LOG(FATAL) << "Not a valid ready node manager: " << ready_node_manager;
 }
 
-VirtualScheduler::VirtualScheduler(const GrapplerItem* grappler_item,
-                                   const bool use_static_shapes,
-                                   Cluster* cluster,
-                                   ReadyNodeManager* ready_nodes)
-    : ready_nodes_(ready_nodes),
-      graph_costs_(Costs::ZeroCosts()),
-      graph_properties_(new GraphProperties(*grappler_item)),
-      cluster_(cluster),
-      grappler_item_(grappler_item),
-      use_static_shapes_(use_static_shapes),
-      placer_(cluster) {
-  graph_costs_.num_ops_total = 0;
-  initialized_ = false;
-}
-
 VirtualScheduler::VirtualScheduler(const bool use_static_shapes,
+                                   const bool use_aggressive_shape_inference,
                                    Cluster* cluster,
                                    ReadyNodeManager* ready_nodes)
     : ready_nodes_(ready_nodes),
       graph_costs_(Costs::ZeroCosts()),
       cluster_(cluster),
       use_static_shapes_(use_static_shapes),
+      use_aggressive_shape_inference_(use_aggressive_shape_inference),
       placer_(cluster) {
   graph_costs_.num_ops_total = 0;
   initialized_ = false;
+  track_mem_usage_snapshot_ = VLOG_IS_ON(1);
 }
 
 Status VirtualScheduler::Init(const GrapplerItem* item) {
   grappler_item_ = item;
   graph_properties_ = absl::make_unique<GraphProperties>(*item);
 
-  return Init();
-}
-
-// TODO(pcma): Merge with Init(const GrapplerItem* item) when this
-// deprecated API is deleted
-Status VirtualScheduler::Init() {
   initialized_ = false;
 
   // Clear all internal states so that the VirtualScheduler is reusable for
@@ -366,7 +352,8 @@ Status VirtualScheduler::Init() {
 
   // Construct graph properties.
   if (use_static_shapes_) {
-    TF_RETURN_IF_ERROR(graph_properties_->InferStatically(true));
+    TF_RETURN_IF_ERROR(graph_properties_->InferStatically(
+        true, use_aggressive_shape_inference_));
   } else {
     TF_RETURN_IF_ERROR(graph_properties_->InferDynamically(cluster_));
   }
@@ -400,6 +387,8 @@ Status VirtualScheduler::Init() {
     name_to_node[node->name()] = node;
   }
 
+  // Traverse the graph to check if the graph is annotated with Switch outputs.
+  // Also record _Send nodes.
   // TODO(dyoon): Instead of identifying _Send node here manually, add _Send
   // to _Recv as control dependency when creating GrapplerItem.
   std::unordered_map<string, const NodeDef*> name_to_send;
@@ -408,6 +397,11 @@ Status VirtualScheduler::Init() {
       const auto& attr = node.attr();
       name_to_send[attr.at("tensor_name").s()] = &node;
     }
+
+    if (IsSwitch(node)) {
+      const auto& attr = node.attr();
+      if (attr.count(kOutputSlots) > 0) switch_outputs_annotated_ = true;
+    }
   }
 
   // To reuse _Recv ops.
@@ -562,7 +556,7 @@ void VirtualScheduler::MaybeUpdateInputOutput(const NodeDef* node) {
       inputs.push_back(control_message);
       outputs.push_back(control_message);
     } else {
-      auto output_properties =
+      const auto& output_properties =
           graph_properties_->GetOutputProperties(NodeName(input_source_name));
       // Like with HasInputProperties, if a node does not have output
       // properties, it's likely it was pruned during the shape inference run.
@@ -769,6 +763,82 @@ Costs& VirtualScheduler::FindOrCreateZero(const string& op_name,
   return it->second;
 }
 
+// Check Switch outputs in updated MetaGraphDef, add corresponding nodes to
+// ready queue.
+// Fallback to add all outputs if fail to find the actual output.
+bool VirtualScheduler::AddSwitchOutputsToReadyQueue(
+    const NodeDef* node, int curr_iter, const Costs::Duration& curr_time) {
+  if (node->attr().count(kOutputSlots) == 0) return false;
+
+  auto& node_state = node_map_[node];
+  const auto& slot_vector = node->attr().at(kOutputSlots);
+  if (slot_vector.list().i_size() <= curr_iter) {
+    // Sometimes we encounter infinite loop. Fall back to add all outputs.
+    return false;
+  }
+
+  int slot = slot_vector.list().i(curr_iter);
+  for (const auto& port_num_output_pair : node_state.outputs) {
+    if (port_num_output_pair.first != slot) continue;
+
+    for (auto* output_node : port_num_output_pair.second) {
+      auto& output_state = node_map_[output_node];
+      output_state.num_inputs_ready++;
+      // Execute a node as soon as all its inputs are ready. Merge nodes
+      // are special since they run as soon as one of their inputs becomes
+      // available.
+      if (output_state.num_inputs_ready == output_state.inputs.size() ||
+          IsMerge(*output_node)) {
+        // This output node is now ready.
+        output_state.time_ready = curr_time;
+        ready_nodes_->AddNode(output_node);
+        VLOG(3) << "Node " << node->name() << " iter " << curr_iter << "/"
+                << slot_vector.list().i_size() << " Add Switch output " << slot
+                << ": " << output_node->name();
+      }
+    }
+    return true;
+  }
+
+  return false;
+}
+
+void VirtualScheduler::AddOutputNodesToReadyQueue(
+    const NodeDef* node, const Costs::Duration& curr_time) {
+  auto& node_state = node_map_[node];
+  int curr_iter = node_state.num_executed_times;
+  ++node_state.num_executed_times;
+
+  if (switch_outputs_annotated_) {
+    // If the graph is annotated with StepStats, reset num_inputs_ready so we
+    // can schedule the node multiple times.
+    node_state.num_inputs_ready = 0;
+
+    // For Switch node, get output branch from updated MetaGraphDef.
+    if (IsSwitch(*node) &&
+        AddSwitchOutputsToReadyQueue(node, curr_iter, curr_time))
+      return;
+  }
+
+  // Increment num_inputs_ready of the output nodes and maybe add to ready
+  // nodes.
+  for (const auto& port_num_output_pair : node_state.outputs) {
+    for (auto* output_node : port_num_output_pair.second) {
+      auto& output_state = node_map_[output_node];
+      output_state.num_inputs_ready++;
+      // Execute a node as soon as all its inputs are ready. Merge nodes are
+      // special since they run as soon as one of their inputs becomes
+      // available.
+      if (output_state.num_inputs_ready == output_state.inputs.size() ||
+          IsMerge(*output_node)) {
+        // This output node is now ready.
+        output_state.time_ready = curr_time;
+        ready_nodes_->AddNode(output_node);
+      }
+    }
+  }
+}
+
 bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
   // Update graph_costs_ and per-op costs.
   graph_costs_ = CombineCosts(graph_costs_, node_costs);
@@ -778,13 +848,16 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
   auto& op_cost = FindOrCreateZero(op_name, &op_to_cost_);
   op_cost = CombineCosts(op_cost, node_costs);
 
-  // Also keep track of op counts and costs per op (with their shapes).
-  OpContext op_context = GetCurrNode();
-  string node_description = GetOpDescription(op_context.op_info);
-  op_counts_[node_description] += 1;
-  op_costs_[node_description] =
-      std::make_pair(node_costs.execution_time.asMicroSeconds().count(),
-                     !node_costs.inaccurate);
+  if (VLOG_IS_ON(2)) {
+    // Also keep track of op counts and costs per op (with their shapes).
+    OpContext op_context = GetCurrNode();
+
+    string node_description = GetOpDescription(op_context.op_info);
+    op_counts_[node_description] += 1;
+    op_costs_[node_description] =
+        std::make_pair(node_costs.execution_time.asMicroSeconds().count(),
+                       !node_costs.inaccurate);
+  }
 
   // Update node and device states.
   auto& node_state = node_map_[node];
@@ -793,6 +866,10 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
   // Node is scheduled when the device is available AND all the inputs are
   // ready; hence, time_scheduled is time_ready if time_ready > device curr
   // time.
+  // TODO(andiryxu): Current node_state result only records the last execution.
+  // With annotated MetaGraph we can schedule a node for multiple times.
+  // Refine NodeState structure accordingly, e.g. record time_scheduled in a
+  // vector.
   node_state.time_scheduled =
       std::max(device.GetCurrTime(), node_state.time_ready);
   // Override device curr time with the time_scheduled.
@@ -826,22 +903,8 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
           << ", scheduled: " << node_state.time_scheduled.count()
           << ", finished: " << node_state.time_finished.count();
 
-  // Increment num_inputs_ready of the output nodes and maybe add to ready nodes
-  for (const auto& port_num_output_pair : node_state.outputs) {
-    for (auto* output_node : port_num_output_pair.second) {
-      auto& output_state = node_map_[output_node];
-      output_state.num_inputs_ready++;
-      // Execute a node as soon as all its inputs are ready. Merge nodes are
-      // special since they run as soon as one of their inputs becomes
-      // available.
-      if (output_state.num_inputs_ready == output_state.inputs.size() ||
-          IsMerge(*output_node)) {
-        // This output node is now ready.
-        output_state.time_ready = curr_time;
-        ready_nodes_->AddNode(output_node);
-      }
-    }
-  }
+  // Check outputs, add ready nodes to queue.
+  AddOutputNodesToReadyQueue(node, curr_time);
 
   // Increment num_outputs_executed of the input nodes and maybe update memory.
   for (const auto& input_port : node_state.inputs) {
@@ -868,7 +931,10 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
     // check max memory usage.
     if (device.memory_usage > device.max_memory_usage) {
       device.max_memory_usage = device.memory_usage;
-      device.mem_usage_snapshot_at_peak = device.nodes_in_memory;
+
+      if (track_mem_usage_snapshot_) {
+        device.mem_usage_snapshot_at_peak = device.nodes_in_memory;
+      }
     }
   }
 
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index 6a835f32d16d0850c06891f656b2bec910e26b78..d96371bcab5db2d3ef730bf1eec8fe7f733bf4f6 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -70,11 +70,15 @@ struct NodeState {
   // Each output port uses up memory space from time_scheduled to its
   // time_no_references.
 
+  // How many times this node has been executed, e.g. in a while loop.
+  int num_executed_times;
+
   NodeState() {
     num_inputs_ready = 0;
     time_ready = Costs::Duration::max();
     time_scheduled = Costs::Duration::max();
     time_finished = Costs::Duration::max();
+    num_executed_times = 0;
     // Note that num_outputs_executed and time_no_references are not initialized
     // here, since we don't know the size (i.e., # outputs for this node).
   }
@@ -256,16 +260,9 @@ std::unique_ptr<ReadyNodeManager> ReadyNodeManagerFactory(
 // dependencies, device, etc.
 class VirtualScheduler {
  public:
-  // TODO(pcma): Modify power_analyzer.cc to use new API's.
-  // DEPRECATED
-  VirtualScheduler(const GrapplerItem* grappler_item,
-                   const bool use_static_shapes, Cluster* cluster,
-                   ReadyNodeManager* ready_nodes);
-  // DEPRECATED
-  Status Init();
-
   // Does not take ownership of cluster or ready_nodes.
-  VirtualScheduler(bool use_static_shapes, Cluster* cluster,
+  VirtualScheduler(const bool use_static_shapes,
+                   const bool use_aggressive_shape_inference, Cluster* cluster,
                    ReadyNodeManager* ready_nodes);
   // Initializes the scheduler for the specific grappler item.
   // Should be called immediately after the c'tor or when the scheduler will be
@@ -305,6 +302,8 @@ class VirtualScheduler {
     return &node_map_;
   }
 
+  void enable_mem_usage_tracking() { track_mem_usage_snapshot_ = true; }
+
  private:
   // Constants.
   const string kAttrInputSrc = "input_source_";
@@ -328,6 +327,10 @@ class VirtualScheduler {
                           std::map<string, Costs>* op_cost);
   float Round2(const float x) const;
   bool IsPersistentNode(const NodeDef* node) const;
+  bool AddSwitchOutputsToReadyQueue(const NodeDef* node, int curr_iter,
+                                    const Costs::Duration& curr_time);
+  void AddOutputNodesToReadyQueue(const NodeDef* node,
+                                  const Costs::Duration& curr_time);
 
   // Scheduler states:
   ReadyNodeManager* ready_nodes_;  // Not owned.
@@ -356,6 +359,12 @@ class VirtualScheduler {
   const GrapplerItem* grappler_item_;  // Not owned.
   bool use_static_shapes_;
   bool initialized_;
+  bool track_mem_usage_snapshot_;
+  const bool use_aggressive_shape_inference_;
+
+  // Whether the input graph includes Switch nodes annotated with output slots
+  // information.
+  bool switch_outputs_annotated_ = false;
 
   VirtualPlacer placer_;  // owned.
 };
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
index 0a695458e17a576ecda631b576d4ace4aa947dbc..128cb986f11ba4f4bb13583cb293183194e1c744 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
@@ -30,8 +30,13 @@ namespace grappler {
 // Class for testing virtual scheduler.
 class TestVirtualScheduler : public VirtualScheduler {
  public:
-  TestVirtualScheduler(const bool use_static_shapes, Cluster* cluster)
-      : VirtualScheduler(use_static_shapes, cluster, &ready_node_manager_) {}
+  TestVirtualScheduler(const bool use_static_shapes,
+                       const bool use_aggressive_shape_inference,
+                       Cluster* cluster)
+      : VirtualScheduler(use_static_shapes, use_aggressive_shape_inference,
+                         cluster, &ready_node_manager_) {
+    enable_mem_usage_tracking();
+  }
 
   FRIEND_TEST(VirtualSchedulerTest, MemoryUsage);
   FRIEND_TEST(VirtualSchedulerTest, ControlDependency);
@@ -66,7 +71,8 @@ class VirtualSchedulerTest : public ::testing::Test {
     devices[kCPU1] = cpu_device;
     cluster_ = absl::make_unique<VirtualCluster>(devices);
     scheduler_ = absl::make_unique<TestVirtualScheduler>(
-        /* use_static_shapes = */ true, cluster_.get());
+        /*use_static_shapes=*/true,
+        /*use_aggressive_shape_inference=*/true, cluster_.get());
   }
 
   NodeDef node1_, node2_, node3_, node4_, node5_, node6_;
@@ -867,6 +873,439 @@ versions {
     grappler_item_->fetch = {"while/Exit", "while/Exit_1"};
   }
 
+  // A simple while loop strengthened with Switch outputs.
+  void CreateGrapplerItemWithLoopSwitchOutputs() {
+    // Test graph produced in python using:
+    /*
+      with tf.Graph().as_default():
+      i0 = tf.constant(0)
+      m0 = tf.ones([2, 2])
+      c = lambda i, m: i < 10
+      b = lambda i, m: [i+1, tf.concat([m, m], axis=0)]
+      r = tf.while_loop(
+      c, b, loop_vars=[i0, m0],
+      shape_invariants=[i0.get_shape(), tf.TensorShape([None, 2])])
+      with open('/tmp/graph.pbtxt', 'w') as f:
+      f.write(str(tf.get_default_graph().as_graph_def()))
+    */
+    const string gdef_ascii = R"EOF(
+node {
+  name: "Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "ones"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 2
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "while/Enter"
+  op: "Enter"
+  input: "Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/Enter_1"
+  op: "Enter"
+  input: "ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/Merge"
+  op: "Merge"
+  input: "while/Enter"
+  input: "while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Merge_1"
+  op: "Merge"
+  input: "while/Enter_1"
+  input: "while/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/Less/y"
+  op: "Const"
+  input: "^while/Merge"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "while/Less"
+  op: "Less"
+  input: "while/Merge"
+  input: "while/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/LoopCond"
+  op: "LoopCond"
+  input: "while/Less"
+}
+node {
+  name: "while/Switch"
+  op: "Switch"
+  input: "while/Merge"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge"
+      }
+    }
+  }
+  attr {
+    key: "_output_slot_vector"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/Switch_1"
+  op: "Switch"
+  input: "while/Merge_1"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_slot_vector"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/Identity"
+  op: "Identity"
+  input: "while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Identity_1"
+  op: "Identity"
+  input: "while/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/add/y"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "while/add"
+  op: "Add"
+  input: "while/Identity"
+  input: "while/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/concat/axis"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/concat"
+  op: "ConcatV2"
+  input: "while/Identity_1"
+  input: "while/Identity_1"
+  input: "while/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/NextIteration"
+  op: "NextIteration"
+  input: "while/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/NextIteration_1"
+  op: "NextIteration"
+  input: "while/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/Exit"
+  op: "Exit"
+  input: "while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Exit_1"
+  op: "Exit"
+  input: "while/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+versions {
+  producer: 21
+}
+  )EOF";
+
+    grappler_item_.reset(new GrapplerItem);
+    CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii,
+                                                &grappler_item_->graph));
+    grappler_item_->id = "test_graph";
+    grappler_item_->fetch = {"while/Exit", "while/Exit_1"};
+  }
+
+  // Create a FusedBatchNorm op that has multiple output ports.
   void CreateGrapplerItemWithInterDeviceTransfers() {
     tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice(kCPU0);
 
@@ -1940,6 +2379,89 @@ TEST_F(VirtualSchedulerTest, WhileLoop) {
   ValidateDependencyChain(start_times, {"while/Switch_1", "while/Exit_1"});
 }
 
+TEST_F(VirtualSchedulerTest, WhileLoopWithSwitchOutputs) {
+  // Init.
+  CreateGrapplerItemWithLoopSwitchOutputs();
+  InitScheduler();
+
+  // Runs the scheduler.
+  RunScheduler("");
+
+  RunMetadata metadata;
+  scheduler_->Summary(&metadata);
+
+  // Nodes in topological order:
+  // * const, ones
+  // * while/Enter, while/Enter_1
+  // * while/Merge, while/Merge_1
+  // * while/Less/y
+  // * while/Less
+  // * while/LoopCond
+  // * while/Switch, while/Switch_1
+  // * while/Identity, while/Identity_1, while/Exit, while/Exit_1
+  // * while/add/y, while/concat/axis
+  // * while/add, while/concat
+  // * while/NextIteration, while/NextIteration_1
+
+  int num_next_iteration = 0;
+  int num_next_iteration_1 = 0;
+  int num_exit = 0;
+  int num_exit_1 = 0;
+  int64 next_iter_start_micro;
+  int64 next_iter_1_start_micro;
+  int64 exit_start_micro;
+  int64 exit_1_start_micro;
+
+  std::unordered_map<string, int64> start_times;
+  for (const auto& device_step_stats : metadata.step_stats().dev_stats()) {
+    for (const auto& stats : device_step_stats.node_stats()) {
+      start_times[stats.node_name()] = stats.all_start_micros();
+      if (stats.node_name() == "while/NextIteration") {
+        ++num_next_iteration;
+        next_iter_start_micro = stats.all_start_micros();
+      } else if (stats.node_name() == "while/NextIteration_1") {
+        ++num_next_iteration_1;
+        next_iter_1_start_micro = stats.all_start_micros();
+      } else if (stats.node_name() == "while/Exit") {
+        ++num_exit;
+        exit_start_micro = stats.all_start_micros();
+      } else if (stats.node_name() == "while/Exit_1") {
+        ++num_exit_1;
+        exit_1_start_micro = stats.all_start_micros();
+      }
+    }
+  }
+
+  // Makes sure we run the loop body for ten times.
+  EXPECT_EQ(10, num_next_iteration);
+  EXPECT_EQ(10, num_next_iteration_1);
+  EXPECT_EQ(1, num_exit);
+  EXPECT_EQ(1, num_exit_1);
+
+  // Start times of while/NextIteration and while/NextIteration_1 should be
+  // different, so should be those of while/Exit and while/Exit_1.
+  EXPECT_NE(next_iter_start_micro, next_iter_1_start_micro);
+  EXPECT_NE(exit_start_micro, exit_1_start_micro);
+
+  // Checks dependency among the nodes; no matter what scheduling mechanism we
+  // use, the scheduled ops should follow these dependency chains.
+  // We have to break the loop into two parts, identified by Switch outputs.
+  ValidateDependencyChain(
+      start_times,
+      {"Const", "while/Enter", "while/Merge", "while/Less/y", "while/Less",
+       "while/LoopCond", "while/Switch", "while/Exit"});
+  ValidateDependencyChain(start_times, {"while/Identity", "while/add/y",
+                                        "while/add", "while/NextIteration"});
+  ValidateDependencyChain(
+      start_times, {"ones", "while/Enter_1", "while/Merge_1", "while/Switch_1",
+                    "while/Exit_1"});
+  ValidateDependencyChain(start_times, {"while/Identity_1", "while/concat",
+                                        "while/NextIteration_1"});
+  ValidateDependencyChain(
+      start_times, {"while/Identity", "while/concat/axis", "while/concat"});
+  ValidateDependencyChain(start_times, {"while/Identity", "while/add"});
+}
+
 TEST_F(VirtualSchedulerTest, InterDeviceTransfer) {
   // Init.
   CreateGrapplerItemWithInterDeviceTransfers();
diff --git a/tensorflow/core/grappler/graph_topology_view.cc b/tensorflow/core/grappler/graph_topology_view.cc
new file mode 100644
index 0000000000000000000000000000000000000000..38ccfbaeb88fc9a21f83ca86482a75e9187ab382
--- /dev/null
+++ b/tensorflow/core/grappler/graph_topology_view.cc
@@ -0,0 +1,163 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/graph_topology_view.h"
+
+#include <algorithm>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
+#include "absl/types/span.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+
+template <typename T>
+inline void SortAndRemoveDuplicates(T* v) {
+  std::sort(v->begin(), v->end());
+  v->erase(std::unique(v->begin(), v->end()), v->end());
+}
+
+}  // namespace
+
+Status GraphTopologyView::InitializeFromGraph(
+    const GraphDef& graph,
+    const absl::Span<const GraphView::Edge> ephemeral_edges) {
+  if (graph_ != nullptr) {
+    return errors::InvalidArgument("GraphTopologyView is already initialized.");
+  }
+
+  graph_ = &graph;
+  num_nodes_ = graph.node_size();
+  index_to_node_name_.resize(num_nodes_);
+  node_name_to_index_.rehash(num_nodes_);
+  fanins_.resize(num_nodes_);
+  fanouts_.resize(num_nodes_);
+
+  // Build map from name to index and vice versa.
+  for (int node_idx = 0; node_idx < num_nodes_; ++node_idx) {
+    const NodeDef& node = graph.node(node_idx);
+    node_name_to_index_.emplace(node.name(), node_idx);
+    index_to_node_name_.emplace_back(node.name());
+  }
+
+  // 1. Add ephemeral edges to the adjacency lists.
+  for (const GraphView::Edge& edge : ephemeral_edges) {
+    const auto src = node_name_to_index_.find(edge.src.node->name());
+    if (src == node_name_to_index_.end()) {
+      return errors::InvalidArgument("Non-existent src node: ",
+                                     edge.src.node->name());
+    }
+    const auto dst = node_name_to_index_.find(edge.dst.node->name());
+    if (dst == node_name_to_index_.end()) {
+      return errors::InvalidArgument("Non-existent dst node: ",
+                                     edge.dst.node->name());
+    }
+    const int src_idx = src->second;
+    const int dst_idx = dst->second;
+    fanins_[dst_idx].push_back(src_idx);
+    fanouts_[src_idx].push_back(dst_idx);
+  }
+
+  // 2. Add graph edges to the adjacency lists.
+  for (int node_idx = 0; node_idx < num_nodes_; ++node_idx) {
+    const NodeDef& node = graph.node(node_idx);
+    fanins_[node_idx].reserve(node.input_size());
+
+    for (const string& input : node.input()) {
+      TensorId tensor = ParseTensorName(input);
+      const auto it = node_name_to_index_.find(tensor.node());
+      if (it == node_name_to_index_.end()) {
+        return errors::InvalidArgument("Non-existent input ", input,
+                                       " for node ", node.name());
+      }
+      const int input_idx = it->second;
+      fanins_[node_idx].push_back(input_idx);
+      fanouts_[input_idx].push_back(node_idx);
+    }
+
+    // Dedup the input list while it's still hot in cache.
+    SortAndRemoveDuplicates(&fanins_[node_idx]);
+  }
+
+  // Dedup outputs for all the graph nodes.
+  for (int node_idx = 0; node_idx < num_nodes_; ++node_idx) {
+    SortAndRemoveDuplicates(&fanouts_[node_idx]);
+  }
+
+  return Status::OK();
+}
+
+Status GraphTopologyView::InitializeFromGraph(const GraphDef& graph) {
+  return InitializeFromGraph(graph, absl::Span<GraphView::Edge>());
+}
+
+bool GraphTopologyView::HasNode(const absl::string_view node_name) const {
+  DCHECK(is_initialized()) << "GraphTopologyView is not initialized";
+  const auto it = node_name_to_index_.find(node_name);
+  return it != node_name_to_index_.end();
+}
+
+const NodeDef* GraphTopologyView::GetNode(
+    const absl::string_view node_name) const {
+  DCHECK(is_initialized()) << "GraphTopologyView is not initialized";
+  const auto it = node_name_to_index_.find(node_name);
+  return it == node_name_to_index_.end() ? nullptr : &graph_->node(it->second);
+}
+
+const NodeDef* GraphTopologyView::GetNode(int node_idx) const {
+  DCHECK(is_initialized()) << "GraphTopologyView is not initialized";
+  DCHECK(node_idx >= 0 && node_idx < num_nodes_) << "node_idx is out of range";
+  return &graph_->node(node_idx);
+}
+
+const absl::optional<int> GraphTopologyView::GetNodeIndex(
+    const absl::string_view node_name) const {
+  DCHECK(is_initialized()) << "GraphTopologyView is not initialized";
+  const auto it = node_name_to_index_.find(node_name);
+  DCHECK(it != node_name_to_index_.end()) << "Node doesn't exist in a graph";
+  return it == node_name_to_index_.end() ? absl::nullopt
+                                         : absl::make_optional(it->second);
+}
+
+const absl::optional<int> GraphTopologyView::GetNodeIndex(
+    const NodeDef& node) const {
+  return GetNodeIndex(node.name());
+}
+
+const absl::InlinedVector<int, 4>& GraphTopologyView::GetFanin(
+    int node_idx) const {
+  DCHECK(is_initialized()) << "GraphTopologyView is not initialized";
+  const bool is_valid_node_idx = node_idx >= 0 && node_idx < num_nodes_;
+  DCHECK(is_valid_node_idx) << "node_idx is out of range";
+  return is_valid_node_idx ? fanins_[node_idx] : empty_fanin_;
+}
+
+const absl::InlinedVector<int, 2>& GraphTopologyView::GetFanout(
+    int node_idx) const {
+  DCHECK(is_initialized()) << "GraphTopologyView is not initialized";
+  const bool is_valid_node_idx = node_idx >= 0 && node_idx < num_nodes_;
+  DCHECK(is_valid_node_idx) << "node_idx is out of range";
+  return is_valid_node_idx ? fanouts_[node_idx] : empty_fanout_;
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_topology_view.h b/tensorflow/core/grappler/graph_topology_view.h
new file mode 100644
index 0000000000000000000000000000000000000000..1c222df4b60951eb38b8e24411c4807e4fe4885d
--- /dev/null
+++ b/tensorflow/core/grappler/graph_topology_view.h
@@ -0,0 +1,105 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_TOPOLOGY_VIEW_H_
+#define TENSORFLOW_CORE_GRAPPLER_GRAPH_TOPOLOGY_VIEW_H_
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
+#include "absl/types/span.h"
+#include "tensorflow/core/graph/tensor_id.h"
+#include "tensorflow/core/grappler/graph_view.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// GraphTopologyView is a helper class to simplify `node-to-node` connectivity
+// traversals. Regular `GraphView` simplifies `tensor-to-tensor` traversals:
+// connections between output tensors and inputs of a consumer nodes. For the
+// topology view we are focused on nodes connected to nodes, and it's irrelevant
+// if this connection is formed by one or multiple individual tensors.
+//
+// Example:
+//   a = Placeholder(..)
+//   b = Placeholder(..)
+//   c = AddN([a, a, b])
+//
+// GraphView edges:         [a:0 -> c:0, a:0 -> c:1, b:0 -> c:3]
+// GraphTopologyView edges: [a -> c, b -> c]
+//
+// GraphView is used for exploring single node fanins and fanouts, and
+// GraphTopologyView is focused on efficient full graph traversals (computing
+// graph node properties from transitive fanouts, etc...).
+class GraphTopologyView {
+ public:
+  GraphTopologyView() = default;
+
+  // Initialize graph topology view from the graph. It's possible to pass
+  // additional edges that do not exist in a graph, but must be respected when
+  // computing graph topology. Example: Tensorflow runtime allows concurrent
+  // execution of dequeue/enqueue ops from the same queue resource, but we might
+  // want to enforce ordering between them for the purpose of graph analysis.
+  Status InitializeFromGraph(const GraphDef& graph,
+                             absl::Span<const GraphView::Edge> ephemeral_edges);
+  Status InitializeFromGraph(const GraphDef& graph);
+
+  bool is_initialized() const { return graph_ != nullptr; }
+  int num_nodes() const { return num_nodes_; }
+  const GraphDef* graph() const { return graph_; }
+
+  // Returns true iff the node exists in the underlying graph.
+  bool HasNode(absl::string_view node_name) const;
+
+  // Finds a node by name or returns `nullptr` if it's not in the graph.
+  const NodeDef* GetNode(absl::string_view node_name) const;
+  // Returns a node corresponding to the given node index.
+  const NodeDef* GetNode(int node_idx) const;
+
+  // Returns a node index for the given node name, if the name exists in the
+  // underlying graph. Otherwise returns empty optional.
+  const absl::optional<int> GetNodeIndex(absl::string_view node_name) const;
+  // Returns a node index for the given node, if the node belongs to the
+  // underlying graph. Otherwise returns empty optional.
+  const absl::optional<int> GetNodeIndex(const NodeDef& node) const;
+
+  // Returns all the node indexes that are in the direct fanin of the given
+  // node. If the `node_idx` is outside of [0, num_nodes_) returns empty vector.
+  const absl::InlinedVector<int, 4>& GetFanin(int node_idx) const;
+  // Returns all the node indexes that are in the direct fanout of the given
+  // node. If the `node_idx` is outside of [0, num_nodes_) returns empty vector.
+  const absl::InlinedVector<int, 2>& GetFanout(int node_idx) const;
+
+ private:
+  // WARN: `graph_` must outlive this object and graph nodes must not be
+  // destructed, because node names captured with absl::string_view.
+  const GraphDef* graph_ = nullptr;  // do not own
+  int num_nodes_ = 0;
+  std::vector<absl::string_view> index_to_node_name_;
+  absl::flat_hash_map<absl::string_view, int> node_name_to_index_;
+  std::vector<absl::InlinedVector<int, 4>> fanins_;   // node_idx->input nodes
+  std::vector<absl::InlinedVector<int, 2>> fanouts_;  // node_idx->output nodes
+
+  // We need a valid reference to return from GetFanin/GetFanout if the
+  // `node_idx` argument is outside of the [0, num_nodes_) range.
+  absl::InlinedVector<int, 4> empty_fanin_;
+  absl::InlinedVector<int, 2> empty_fanout_;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_GRAPH_TOPOLOGY_VIEW_H_
diff --git a/tensorflow/core/grappler/graph_topology_view_test.cc b/tensorflow/core/grappler/graph_topology_view_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..36d3a2017cc5ef965a26b0bdbbbdde441fb633db
--- /dev/null
+++ b/tensorflow/core/grappler/graph_topology_view_test.cc
@@ -0,0 +1,117 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/graph_topology_view.h"
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class GraphTopologyViewTest : public ::testing::Test {
+ protected:
+  using NodeConfig = std::pair<string, std::vector<string>>;
+
+  static GraphDef CreateGraph(const std::vector<NodeConfig>& nodes) {
+    GraphDef graph;
+
+    for (const NodeConfig& node : nodes) {
+      const auto& node_name = node.first;
+      const auto& node_inputs = node.second;
+
+      NodeDef node_def;
+      node_def.set_name(node_name);
+      for (const string& input : node_inputs) {
+        node_def.add_input(input);
+      }
+
+      *graph.add_node() = std::move(node_def);
+    }
+
+    return graph;
+  }
+};
+
+TEST_F(GraphTopologyViewTest, SimpleGraph) {
+  const GraphDef graph = CreateGraph({
+      {"a", {}},          // idx: 0
+      {"b", {}},          // idx: 1
+      {"c", {"a", "b"}},  // idx: 2
+      {"d", {"a", "c"}},  // idx: 3
+  });
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+
+  EXPECT_TRUE(graph_view.is_initialized());
+
+  const NodeDef* a_by_name = graph_view.GetNode("a");
+  const NodeDef* a_by_idx = graph_view.GetNode(0);
+  ASSERT_TRUE(a_by_name);
+  ASSERT_TRUE(a_by_idx);
+  EXPECT_EQ(a_by_name, a_by_idx);
+
+  const NodeDef* b_by_name = graph_view.GetNode("b");
+  const NodeDef* b_by_idx = graph_view.GetNode(1);
+  ASSERT_TRUE(b_by_name);
+  ASSERT_TRUE(b_by_idx);
+  EXPECT_EQ(b_by_name, b_by_idx);
+
+  const absl::optional<int> b_idx = graph_view.GetNodeIndex(*b_by_name);
+  ASSERT_TRUE(b_idx.has_value());
+  EXPECT_EQ(b_idx.value(), 1);
+
+  const absl::optional<int> c_idx = graph_view.GetNodeIndex("c");
+  ASSERT_TRUE(c_idx.has_value());
+  EXPECT_EQ(c_idx.value(), 2);
+
+  using Fanin = absl::InlinedVector<int, 4>;
+  EXPECT_EQ(graph_view.GetFanin(0), Fanin());
+  EXPECT_EQ(graph_view.GetFanin(1), Fanin());
+  EXPECT_EQ(graph_view.GetFanin(2), Fanin({0, 1}));
+  EXPECT_EQ(graph_view.GetFanin(3), Fanin({0, 2}));
+
+  using Fanout = absl::InlinedVector<int, 2>;
+  EXPECT_EQ(graph_view.GetFanout(0), Fanout({2, 3}));
+  EXPECT_EQ(graph_view.GetFanout(1), Fanout({2}));
+  EXPECT_EQ(graph_view.GetFanout(2), Fanout({3}));
+  EXPECT_EQ(graph_view.GetFanout(3), Fanout());
+}
+
+TEST_F(GraphTopologyViewTest, GraphWithALoop) {
+  const GraphDef graph = CreateGraph({
+      {"a", {}},               // idx: 0
+      {"b", {}},               // idx: 1
+      {"c", {"a", "b", "d"}},  // idx: 2 <<<--- 'c' and 'd' have a loop
+      {"d", {"a", "c"}},       // idx: 3
+  });
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  EXPECT_TRUE(graph_view.is_initialized());
+
+  using Fanin = absl::InlinedVector<int, 4>;
+  EXPECT_EQ(graph_view.GetFanin(2), Fanin({0, 1, 3}));
+  EXPECT_EQ(graph_view.GetFanin(3), Fanin({0, 2}));
+
+  using Fanout = absl::InlinedVector<int, 2>;
+  EXPECT_EQ(graph_view.GetFanout(2), Fanout({3}));
+  EXPECT_EQ(graph_view.GetFanout(3), Fanout({2}));
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_view.cc b/tensorflow/core/grappler/graph_view.cc
index ba9d2eb32181940bc430771db281c6cea8cb48c4..be9b9c36c71c6f8282862de85a211358fa826186 100644
--- a/tensorflow/core/grappler/graph_view.cc
+++ b/tensorflow/core/grappler/graph_view.cc
@@ -66,28 +66,27 @@ int OpInputPortIdToArgId(const NodeDef& node, const OpDef& op, int port_id) {
 bool HasSingleFanoutNode(const GraphView& graph_view, const NodeDef* node,
                          int port) {
   const auto output = GraphView::OutputPort(node, port);
-  const auto fanout = graph_view.GetFanout(output);
-  return fanout.size() <= 1;
+  return graph_view.GetFanout(output).size() <= 1;
 }
 
 bool HasFanouts(const GraphView& graph_view, const NodeDef* node, int port) {
   const auto output = GraphView::OutputPort(node, port);
-  const auto fanout = graph_view.GetFanout(output);
-  return !fanout.empty();
+  return !graph_view.GetFanout(output).empty();
 }
 
-bool NoControlFanin(const GraphView& graph_view, const NodeDef* node) {
-  const auto control_port = GraphView::InputPort(node, -1);
-  return graph_view.GetFanin(control_port).empty();
+bool HasControlFanin(const GraphView& graph_view, const NodeDef* node) {
+  const auto control_port = GraphView::InputPort(node, Graph::kControlSlot);
+  return !graph_view.GetFanin(control_port).empty();
 }
 
-bool NoControlFanout(const GraphView& graph_view, const NodeDef* node) {
-  const auto control_port = GraphView::OutputPort(node, -1);
-  return graph_view.GetFanout(control_port).empty();
+bool HasControlFanout(const GraphView& graph_view, const NodeDef* node) {
+  const auto control_port = GraphView::OutputPort(node, Graph::kControlSlot);
+  return !graph_view.GetFanout(control_port).empty();
 }
 
-bool NoControlFaninOrFanout(const GraphView& graph_view, const NodeDef* node) {
-  return NoControlFanin(graph_view, node) && NoControlFanout(graph_view, node);
+bool HasControlFaninOrFanout(const GraphView& graph_view, const NodeDef* node) {
+  return HasControlFanin(graph_view, node) ||
+         HasControlFanout(graph_view, node);
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/graph_view.h b/tensorflow/core/grappler/graph_view.h
index 16156d0f2042763a7518d5de2c57440343e50f2d..dc4ab93894c4d85038efa8c3052a06f9e5e55d1d 100644
--- a/tensorflow/core/grappler/graph_view.h
+++ b/tensorflow/core/grappler/graph_view.h
@@ -172,9 +172,8 @@ class GraphViewInternal {
     if (fanin.index() < -1) {
       return false;
     }
-    string fanin_string = TensorIdToString(fanin);
-    for (int i = 0; i < node.input_size(); ++i) {
-      if (node.input(i) == fanin_string) {
+    for (const string& input : node.input()) {
+      if (ParseTensorName(input) == fanin) {
         return true;
       }
     }
@@ -370,10 +369,12 @@ bool HasSingleFanoutNode(const GraphView& graph_view, const NodeDef* node,
 
 // Returns true if node has at least one fanout node at given output port.
 bool HasFanouts(const GraphView& graph_view, const NodeDef* node, int port = 0);
-
-bool NoControlFanin(const GraphView& graph_view, const NodeDef* node);
-bool NoControlFanout(const GraphView& graph_view, const NodeDef* node);
-bool NoControlFaninOrFanout(const GraphView& graph_view, const NodeDef* node);
+// Returns true if the node has at least one input control dependency.
+bool HasControlFanin(const GraphView& graph_view, const NodeDef* node);
+// Returns true if the node has at least one output control dependency.
+bool HasControlFanout(const GraphView& graph_view, const NodeDef* node);
+// Returns true if the node has at least one input or output control dependency.
+bool HasControlFaninOrFanout(const GraphView& graph_view, const NodeDef* node);
 
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/grappler_item_builder.cc b/tensorflow/core/grappler/grappler_item_builder.cc
index 9224ee7849211f849c3655d6faea18dcc32b8e17..fc55fb5b3d2f905fc0fab837a9345b7e396acd13 100644
--- a/tensorflow/core/grappler/grappler_item_builder.cc
+++ b/tensorflow/core/grappler/grappler_item_builder.cc
@@ -103,7 +103,12 @@ Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
 
   // Instantiate all variables for function library runtime creation.
   std::vector<std::unique_ptr<Device>> devices;
-  TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(
+  // Only CPU device is used so instead of calling DeviceFactory::AddDevices()
+  // with dummy session config, which will conflict with user defined options
+  // and create unwanted devices, call cpu_factory->CreateDevices() to get CPU
+  // only devices.
+  DeviceFactory* cpu_factory = DeviceFactory::GetFactory("CPU");
+  TF_RETURN_IF_ERROR(cpu_factory->CreateDevices(
       options, "/job:localhost/replica:0/task:0", &devices));
   Device* cpu_device = devices[0].get();
   std::unique_ptr<DeviceMgr> dvc_mgr(new DeviceMgr(std::move(devices)));
diff --git a/tensorflow/core/grappler/mutable_graph_view.cc b/tensorflow/core/grappler/mutable_graph_view.cc
index ca4d5255c0fb321fa3c744480d7b81f975a02589..a8a90d1a57a34a1d757bccacc5d6abcb4f6db7ef 100644
--- a/tensorflow/core/grappler/mutable_graph_view.cc
+++ b/tensorflow/core/grappler/mutable_graph_view.cc
@@ -39,8 +39,122 @@ bool IsTensorIdPortValid(const TensorId& tensor_id) {
   return tensor_id.index() >= Graph::kControlSlot;
 }
 
+bool IsTensorIdRegular(const TensorId& tensor_id) {
+  return tensor_id.index() > Graph::kControlSlot;
+}
+
+bool IsTensorIdControlling(const TensorId& tensor_id) {
+  return tensor_id.index() == Graph::kControlSlot;
+}
+
+bool IsOutputPortRegular(const MutableGraphView::OutputPort& port) {
+  return port.port_id > Graph::kControlSlot;
+}
+
+bool IsOutputPortControlling(const MutableGraphView::OutputPort& port) {
+  return port.port_id == Graph::kControlSlot;
+}
+
+// Determines if node is an Identity where it's first regular input is a Switch
+// node.
+bool IsIdentityConsumingSwitch(const MutableGraphView& graph,
+                               const NodeDef& node) {
+  if ((IsIdentity(node) || IsIdentityNSingleInput(node)) &&
+      node.input_size() > 0) {
+    TensorId tensor_id = ParseTensorName(node.input(0));
+    if (IsTensorIdControlling(tensor_id)) {
+      return false;
+    }
+
+    NodeDef* input_node = graph.GetNode(tensor_id.node());
+    return IsSwitch(*input_node);
+  }
+  return false;
+}
+
+// Determines if node input can be deduped by regular inputs when used as a
+// control dependency. Specifically, if a node is an Identity that leads to a
+// Switch node, when used as a control dependency, that control dependency
+// should not be deduped even though the same node is used as a regular input.
+bool CanDedupControlWithRegularInput(const MutableGraphView& graph,
+                                     const NodeDef& control_node) {
+  return !IsIdentityConsumingSwitch(graph, control_node);
+}
+
+// Determines if node input can be deduped by regular inputs when used as a
+// control dependency. Specifically, if a node is an Identity that leads to a
+// Switch node, when used as a control dependency, that control dependency
+// should not be deduped even though the same node is used as a regular input.
+bool CanDedupControlWithRegularInput(const MutableGraphView& graph,
+                                     absl::string_view control_node_name) {
+  NodeDef* control_node = graph.GetNode(control_node_name);
+  return CanDedupControlWithRegularInput(graph, *control_node);
+}
+
 }  // namespace
 
+void MutableGraphView::AddAndDedupFanouts(NodeDef* node) {
+  absl::flat_hash_set<absl::string_view> fanins;
+  absl::flat_hash_set<absl::string_view> controlling_fanins;
+  int pos = 0;
+  const int last_idx = node->input_size() - 1;
+  int last_pos = last_idx;
+  while (pos <= last_pos) {
+    TensorId tensor_id = ParseTensorName(node->input(pos));
+    absl::string_view input_node_name = tensor_id.node();
+    bool is_control_input = IsTensorIdControlling(tensor_id);
+    bool can_dedup_control_with_regular_input =
+        CanDedupControlWithRegularInput(*this, input_node_name);
+    bool can_dedup_control =
+        is_control_input && (can_dedup_control_with_regular_input ||
+                             (!can_dedup_control_with_regular_input &&
+                              controlling_fanins.contains(input_node_name)));
+    if (!gtl::InsertIfNotPresent(&fanins, input_node_name) &&
+        can_dedup_control) {
+      node->mutable_input()->SwapElements(pos, last_pos--);
+    } else {
+      OutputPort output(nodes()[input_node_name], tensor_id.index());
+
+      if (is_control_input) {
+        fanouts()[output].emplace(node, Graph::kControlSlot);
+      } else {
+        max_regular_output_port()[output.node] =
+            std::max(max_regular_output_port()[output.node], output.port_id);
+        fanouts()[output].emplace(node, pos);
+      }
+      ++pos;
+    }
+    if (is_control_input) {
+      controlling_fanins.insert(input_node_name);
+    }
+  }
+
+  if (last_pos < last_idx) {
+    node->mutable_input()->DeleteSubrange(last_pos + 1, last_idx - last_pos);
+  }
+}
+
+void MutableGraphView::UpdateMaxRegularOutputPortForRemovedFanin(
+    const OutputPort& fanin,
+    const absl::flat_hash_set<InputPort>& fanin_fanouts) {
+  int max_port = max_regular_output_port()[fanin.node];
+  if (!fanin_fanouts.empty() || max_port != fanin.port_id) {
+    return;
+  }
+  bool updated_max_port = false;
+  for (int i = fanin.port_id - 1; i >= 0; --i) {
+    OutputPort fanin_port(fanin.node, i);
+    if (!fanouts()[fanin_port].empty()) {
+      max_regular_output_port()[fanin.node] = i;
+      updated_max_port = true;
+      break;
+    }
+  }
+  if (!updated_max_port) {
+    max_regular_output_port().erase(fanin.node);
+  }
+}
+
 const absl::flat_hash_set<MutableGraphView::InputPort>&
 MutableGraphView::GetFanout(const GraphView::OutputPort& port) const {
   return GetFanout(MutableGraphView::OutputPort(const_cast<NodeDef*>(port.node),
@@ -65,16 +179,16 @@ NodeDef* MutableGraphView::AddNode(NodeDef&& node) {
 
   AddUniqueNodeOrDie(node_in_graph);
 
-  AddFanouts(node_in_graph);
+  AddAndDedupFanouts(node_in_graph);
   return node_in_graph;
 }
 
-void MutableGraphView::UpdateFanouts(absl::string_view from_node,
+bool MutableGraphView::UpdateFanouts(absl::string_view from_node,
                                      absl::string_view to_node) {
   NodeDef* from_node_ptr = GetNode(from_node);
   NodeDef* to_node_ptr = GetNode(to_node);
   if (from_node_ptr && to_node_ptr) {
-    UpdateFanouts(from_node_ptr, to_node_ptr);
+    return UpdateFanoutsInternal(from_node_ptr, to_node_ptr);
   } else if (!from_node_ptr) {
     LOG(WARNING) << absl::Substitute(
         "Can't update fanouts from '$0' to '$1', from node was not found.",
@@ -84,9 +198,11 @@ void MutableGraphView::UpdateFanouts(absl::string_view from_node,
         "Can't update fanouts from '$0' to '$1', to node was not found.",
         from_node, to_node);
   }
+  return false;
 }
 
-void MutableGraphView::UpdateFanouts(NodeDef* from_node, NodeDef* to_node) {
+bool MutableGraphView::UpdateFanoutsInternal(NodeDef* from_node,
+                                             NodeDef* to_node) {
   VLOG(2) << absl::Substitute("Update fanouts from '$0' to '$1'.",
                               from_node->name(), to_node->name());
 
@@ -112,6 +228,7 @@ void MutableGraphView::UpdateFanouts(NodeDef* from_node, NodeDef* to_node) {
   // input to some other node.
   int keep_max_regular_output_port = -1;
 
+  bool modified = false;
   for (const Edge& edge : regular_edges) {
     const OutputPort output_port = edge.src;
     const InputPort input_port = edge.dst;
@@ -120,7 +237,7 @@ void MutableGraphView::UpdateFanouts(NodeDef* from_node, NodeDef* to_node) {
     // AddAndUpdateFanoutsWithoutSelfLoops test for an example).
     if (input_port.node == to_node) {
       keep_max_regular_output_port =
-          std::max(keep_max_regular_output_port, input_port.port_id);
+          std::max(keep_max_regular_output_port, output_port.port_id);
       continue;
     }
 
@@ -135,6 +252,11 @@ void MutableGraphView::UpdateFanouts(NodeDef* from_node, NodeDef* to_node) {
     remove_edge(output_port, input_port);
     // Add an edge between the `to_node` and new fanout node.
     add_edge(OutputPort(to_node, output_port.port_id), input_port);
+    // Dedup control dependency.
+    if (CanDedupControlWithRegularInput(*this, *to_node)) {
+      RemoveControllingFaninInternal(input_port.node, to_node);
+    }
+    modified = true;
   }
 
   // For the control fanouts we do not know the input index in a NodeDef,
@@ -142,29 +264,15 @@ void MutableGraphView::UpdateFanouts(NodeDef* from_node, NodeDef* to_node) {
 
   auto control_fanouts =
       GetFanout(GraphView::OutputPort(from_node, Graph::kControlSlot));
-  if (control_fanouts.empty()) return;
-
-  const string from_control_input = absl::StrCat("^", from_node->name());
-  const string to_control_input = absl::StrCat("^", to_node->name());
 
   for (const InputPort& control_port : control_fanouts) {
     // Node can't be control dependency of itself.
     if (control_port.node == to_node) continue;
 
-    // Find and update input corresponding to control dependency.
     NodeDef* node = control_port.node;
-    for (int i = node->input_size() - 1; i >= 0; --i) {
-      const string& input = node->input(i);
-      if (!IsControlInput(input)) break;  // we reached regular inputs
-      if (input == from_control_input) {
-        node->set_input(i, to_control_input);
-      }
-    }
-
-    // Remove old edge between the `from_node` and the fanout node.
-    remove_edge(OutputPort(from_node, Graph::kControlSlot), control_port);
-    // Add an edge between the `to_node` and new fanout node.
-    add_edge(OutputPort(to_node, Graph::kControlSlot), control_port);
+    modified |= RemoveControllingFaninInternal(node, from_node);
+    // TODO(lyandy): Handle Switch control dependencies.
+    modified |= AddFaninInternal(node, {to_node, Graph::kControlSlot});
   }
 
   // Because we update all regular fanouts of `from_node`, we can just copy
@@ -177,31 +285,54 @@ void MutableGraphView::UpdateFanouts(NodeDef* from_node, NodeDef* to_node) {
   } else {
     max_regular_output_port().erase(from_node);
   }
+
+  return modified;
 }
 
 bool MutableGraphView::AddFaninInternal(NodeDef* node,
                                         const OutputPort& fanin) {
   int num_non_controlling_fanins =
       NumFanins(*node, /*include_controlling_nodes=*/false);
+  bool input_is_control = IsOutputPortControlling(fanin);
+  bool can_dedup_control_with_regular_input =
+      CanDedupControlWithRegularInput(*this, *fanin.node);
+  // Don't add duplicate control dependencies.
+  if (input_is_control) {
+    const int start =
+        can_dedup_control_with_regular_input ? 0 : num_non_controlling_fanins;
+    for (int i = start; i < node->input_size(); ++i) {
+      if (ParseTensorName(node->input(i)).node() == fanin.node->name()) {
+        return false;
+      }
+    }
+  }
+
   InputPort input;
   input.node = node;
-  input.port_id = fanin.port_id == Graph::kControlSlot
-                      ? Graph::kControlSlot
-                      : num_non_controlling_fanins;
+  input.port_id =
+      input_is_control ? Graph::kControlSlot : num_non_controlling_fanins;
 
-  if (!gtl::InsertIfNotPresent(&fanouts()[fanin], input)) {
-    return false;
-  }
   node->add_input(TensorIdToString({fanin.node->name(), fanin.port_id}));
-  if (fanin.port_id > Graph::kControlSlot) {
-    int node_input_size = node->input_size() - 1;
+  if (IsOutputPortRegular(fanin)) {
+    int last_node_input = node->input_size() - 1;
     // If there are control dependencies in node, move newly inserted fanin to
     // be before such control dependencies.
-    if (num_non_controlling_fanins < node_input_size) {
-      node->mutable_input()->SwapElements(node_input_size,
+    if (num_non_controlling_fanins < last_node_input) {
+      node->mutable_input()->SwapElements(last_node_input,
                                           num_non_controlling_fanins);
     }
   }
+
+  fanouts()[fanin].insert(input);
+  if (max_regular_output_port()[fanin.node] < fanin.port_id) {
+    max_regular_output_port()[fanin.node] = fanin.port_id;
+  }
+
+  // Dedup control dependencies.
+  if (!input_is_control && can_dedup_control_with_regular_input) {
+    RemoveControllingFaninInternal(node, fanin.node);
+  }
+
   return true;
 }
 
@@ -213,9 +344,9 @@ bool MutableGraphView::AddFaninInternal(NodeDef* node, const TensorId& fanin) {
   return AddFaninInternal(node, {fanin_node, fanin.index()});
 }
 
-bool MutableGraphView::AddFanin(absl::string_view node_name,
-                                const TensorId& fanin) {
-  if (!IsTensorIdPortValid(fanin)) {
+bool MutableGraphView::AddRegularFanin(absl::string_view node_name,
+                                       const TensorId& fanin) {
+  if (!IsTensorIdRegular(fanin)) {
     return false;
   }
   NodeDef* node = GetNode(node_name);
@@ -225,58 +356,153 @@ bool MutableGraphView::AddFanin(absl::string_view node_name,
   return AddFaninInternal(node, fanin);
 }
 
-bool MutableGraphView::RemoveFanins(NodeDef* node,
-                                    absl::Span<const TensorId> fanins) {
-  bool modified = false;
-  auto mutable_inputs = node->mutable_input();
-  int curr_pos = 0;
-  int num_inputs = node->input_size();
-  for (int i = 0; i < num_inputs; ++i) {
-    TensorId tensor_id = ParseTensorName(node->input(i));
-    bool remove_fanin =
-        std::find(fanins.begin(), fanins.end(), tensor_id) != fanins.end();
-    bool update_fanin = !remove_fanin && modified;
-    if (remove_fanin || update_fanin) {
-      OutputPort fanin(nodes()[tensor_id.node()], tensor_id.index());
-
-      InputPort input;
-      input.node = node;
-      input.port_id =
-          tensor_id.index() == Graph::kControlSlot ? Graph::kControlSlot : i;
-
-      if (remove_fanin) {
-        fanouts()[fanin].erase(input);
-      } else {
-        // Shift inputs to be retained.
-        if (tensor_id.index() > Graph::kControlSlot) {
-          fanouts()[fanin].erase(input);
-          fanouts()[fanin].insert(InputPort(node, i));
+bool MutableGraphView::AddControllingFanin(absl::string_view node_name,
+                                           const TensorId& fanin) {
+  NodeDef* node = GetNode(node_name);
+  if (node == nullptr) {
+    return false;
+  }
+  NodeDef* fanin_node = GetNode(fanin.node());
+  if (fanin_node == nullptr) {
+    return false;
+  }
+
+  if (!IsSwitch(*fanin_node)) {
+    return AddFaninInternal(node, {fanin_node, Graph::kControlSlot});
+  } else {
+    if (IsTensorIdControlling(fanin)) {
+      // Cannot add a Switch node control dependency.
+      return false;
+    }
+    // We can't anchor control dependencies directly on the switch node: unlike
+    // other nodes only one of the outputs of the switch node will be generated
+    // when the switch node is executed, and we need to make sure the control
+    // dependency is only triggered when the corresponding output is triggered.
+    // We start by looking for an identity node connected to the output of the
+    // switch node, and use it to anchor the control dependency.
+    auto fanouts = GetFanouts(*fanin_node, /*include_controlled_nodes=*/false);
+    for (auto fanout : fanouts) {
+      if (IsIdentity(*fanout.node) || IsIdentityNSingleInput(*fanout.node)) {
+        if (ParseTensorName(fanout.node->input(0)) == fanin) {
+          return AddFaninInternal(node, {fanout.node, Graph::kControlSlot});
         }
-        mutable_inputs->SwapElements(i, curr_pos++);
       }
+    }
+    // We haven't found an existing node where we can anchor the control
+    // dependency: add a new identity node.
+    string ctrl_dep_name = AddPrefixToNodeName(
+        absl::StrCat(fanin.node(), "_", fanin.index()), kMutableGraphViewCtrl);
 
+    // Reuse a previously created node, if possible.
+    NodeDef* ctrl_dep_node = GetNode(ctrl_dep_name);
+    if (ctrl_dep_node == nullptr) {
+      NodeDef new_node;
+      new_node.set_name(ctrl_dep_name);
+      new_node.set_op("Identity");
+      new_node.set_device(fanin_node->device());
+      (*new_node.mutable_attr())["T"].set_type(
+          fanin_node->attr().at("T").type());
+      new_node.add_input(TensorIdToString(fanin));
+      ctrl_dep_node = AddNode(std::move(new_node));
+    }
+    return AddFaninInternal(node, {ctrl_dep_node, Graph::kControlSlot});
+  }
+}
+
+bool MutableGraphView::RemoveRegularFaninInternal(NodeDef* node,
+                                                  const TensorId& fanin) {
+  auto remove_input = [this, node](const TensorId& tensor_id, int port,
+                                   bool update_max_port) {
+    OutputPort fanin_port(nodes()[tensor_id.node()], tensor_id.index());
+    InputPort input(node, port);
+
+    absl::flat_hash_set<InputPort>* fanouts_set = &fanouts()[fanin_port];
+    fanouts_set->erase(input);
+    if (update_max_port) {
+      UpdateMaxRegularOutputPortForRemovedFanin(fanin_port, *fanouts_set);
+    }
+    return fanouts_set;
+  };
+
+  auto mutable_inputs = node->mutable_input();
+  bool modified = false;
+  const int num_inputs = node->input_size();
+  int i;
+  int curr_pos = 0;
+  for (i = 0; i < num_inputs; ++i) {
+    TensorId tensor_id = ParseTensorName(node->input(i));
+    if (IsTensorIdControlling(tensor_id)) {
+      break;
+    }
+    if (tensor_id == fanin) {
+      remove_input(tensor_id, i, /*update_max_port=*/true);
       modified = true;
+    } else if (modified) {
+      // Regular inputs will need to have their ports updated.
+      auto fanouts_set = remove_input(tensor_id, i, /*update_max_port=*/false);
+      fanouts_set->insert({node, curr_pos});
+      // Shift inputs to be retained.
+      mutable_inputs->SwapElements(i, curr_pos++);
     } else {
       // Skip inputs to be retained until first modification.
       curr_pos++;
     }
   }
-  if (modified) {
-    mutable_inputs->DeleteSubrange(curr_pos, num_inputs - curr_pos);
+
+  if (modified && curr_pos < i) {
+    // Remove fanins from node inputs.
+    mutable_inputs->DeleteSubrange(curr_pos, i - curr_pos);
   }
+
   return modified;
 }
 
-bool MutableGraphView::RemoveFanin(absl::string_view node_name,
-                                   const TensorId& fanin) {
-  if (!IsTensorIdPortValid(fanin)) {
+bool MutableGraphView::RemoveRegularFanin(absl::string_view node_name,
+                                          const TensorId& fanin) {
+  if (!IsTensorIdRegular(fanin)) {
     return false;
   }
   NodeDef* node = GetNode(node_name);
   if (node == nullptr) {
     return false;
   }
-  return RemoveFanins(node, {fanin});
+  return RemoveRegularFaninInternal(node, fanin);
+}
+
+bool MutableGraphView::RemoveControllingFaninInternal(NodeDef* node,
+                                                      NodeDef* fanin_node) {
+  for (int i = node->input_size() - 1; i >= 0; --i) {
+    TensorId tensor_id = ParseTensorName(node->input(i));
+    if (tensor_id.index() > Graph::kControlSlot) {
+      break;
+    }
+    if (tensor_id.node() == fanin_node->name()) {
+      fanouts()[{fanin_node, Graph::kControlSlot}].erase(
+          {node, Graph::kControlSlot});
+      node->mutable_input()->SwapElements(i, node->input_size() - 1);
+      node->mutable_input()->RemoveLast();
+      return true;
+    }
+  }
+  return false;
+}
+
+bool MutableGraphView::RemoveControllingFaninInternal(
+    NodeDef* node, absl::string_view fanin_node_name) {
+  NodeDef* fanin = GetNode(fanin_node_name);
+  if (fanin == nullptr) {
+    return false;
+  }
+  return RemoveControllingFaninInternal(node, fanin);
+}
+
+bool MutableGraphView::RemoveControllingFanin(
+    absl::string_view node_name, absl::string_view fanin_node_name) {
+  NodeDef* node = GetNode(node_name);
+  if (node == nullptr) {
+    return false;
+  }
+  return RemoveControllingFaninInternal(node, fanin_node_name);
 }
 
 bool MutableGraphView::RemoveAllFanins(absl::string_view node_name,
@@ -314,15 +540,20 @@ bool MutableGraphView::UpdateFanin(absl::string_view node_name,
     return false;
   }
 
-  bool is_from_fanin_control = from_fanin.index() == Graph::kControlSlot;
-  bool is_to_fanin_control = to_fanin.index() == Graph::kControlSlot;
   // When replacing a non control dependency fanin with a control dependency, or
   // vice versa, remove and add, so ports can be updated properly in fanout(s).
-  if (is_from_fanin_control || is_to_fanin_control) {
-    bool modified = RemoveFanins(node, {from_fanin});
-    if (!HasFanin(*node, to_fanin)) {
-      modified |= AddFaninInternal(node, to_fanin);
+  bool from_fanin_is_control = IsTensorIdControlling(from_fanin);
+  if (from_fanin_is_control || IsTensorIdControlling(to_fanin)) {
+    bool modified = false;
+    if (from_fanin_is_control) {
+      modified |= RemoveControllingFaninInternal(node, from_fanin.node());
+    } else {
+      modified |= RemoveRegularFaninInternal(node, from_fanin);
+    }
+    if (modified) {
+      AddFaninInternal(node, to_fanin);
     }
+
     return modified;
   }
 
@@ -336,133 +567,50 @@ bool MutableGraphView::UpdateFanin(absl::string_view node_name,
   string to_fanin_string = TensorIdToString(to_fanin);
   int num_inputs = node->input_size();
   bool modified = false;
+  absl::flat_hash_set<InputPort>* from_fanin_port_fanouts = nullptr;
+  absl::flat_hash_set<InputPort>* to_fanin_port_fanouts = nullptr;
   for (int i = 0; i < num_inputs; ++i) {
     if (ParseTensorName(node->input(i)) == from_fanin) {
-      OutputPort from_fanin_port(from_fanin_node, from_fanin.index());
       InputPort old_input;
       old_input.node = node;
       old_input.port_id =
-          from_fanin.index() == Graph::kControlSlot ? Graph::kControlSlot : i;
-      fanouts()[from_fanin_port].erase(old_input);
+          IsTensorIdControlling(from_fanin) ? Graph::kControlSlot : i;
+      if (from_fanin_port_fanouts == nullptr) {
+        OutputPort from_fanin_port(from_fanin_node, from_fanin.index());
+        from_fanin_port_fanouts = &fanouts()[from_fanin_port];
+      }
+      from_fanin_port_fanouts->erase(old_input);
 
-      OutputPort to_fanin_port(to_fanin_node, to_fanin.index());
       InputPort new_input;
       new_input.node = node;
       new_input.port_id =
-          to_fanin.index() == Graph::kControlSlot ? Graph::kControlSlot : i;
-      fanouts()[to_fanin_port].insert(new_input);
+          IsTensorIdControlling(to_fanin) ? Graph::kControlSlot : i;
+      if (to_fanin_port_fanouts == nullptr) {
+        OutputPort to_fanin_port(to_fanin_node, to_fanin.index());
+        to_fanin_port_fanouts = &fanouts()[to_fanin_port];
+      }
+      to_fanin_port_fanouts->insert(new_input);
 
       node->set_input(i, to_fanin_string);
       modified = true;
     }
   }
 
-  return modified;
-}
-
-bool MutableGraphView::DedupControllingFanins(NodeDef* node) {
-  absl::flat_hash_set<absl::string_view> fanins;
-  absl::flat_hash_set<string> removed_fanins;
-  int pos = 0;
-  const int last_idx = node->input_size() - 1;
-  int last_pos = last_idx;
-  while (pos <= last_pos) {
-    const string& input = node->input(pos);
-    TensorId tensor_id = ParseTensorName(input);
-    if (!gtl::InsertIfNotPresent(&fanins, tensor_id.node()) &&
-        IsControlInput(tensor_id)) {
-      node->mutable_input()->SwapElements(pos, last_pos--);
-      removed_fanins.insert(input);
-    } else {
-      ++pos;
+  // Dedup control dependencies and update max regular output ports.
+  if (modified) {
+    UpdateMaxRegularOutputPortForRemovedFanin(
+        {from_fanin_node, from_fanin.index()}, *from_fanin_port_fanouts);
+    if (max_regular_output_port()[to_fanin_node] < to_fanin.index()) {
+      max_regular_output_port()[to_fanin_node] = to_fanin.index();
     }
-  }
-
-  if (last_pos < last_idx) {
-    absl::flat_hash_set<string> retained_fanins(
-        node->input().begin(), node->input().begin() + last_pos + 1);
-    for (const auto& removed : removed_fanins) {
-      if (!retained_fanins.contains(removed)) {
-        OutputPort fanin(nodes()[ParseTensorName(removed).node()],
-                         Graph::kControlSlot);
-        fanouts()[fanin].erase({node, Graph::kControlSlot});
-      }
+    if (CanDedupControlWithRegularInput(*this, *to_fanin_node)) {
+      RemoveControllingFaninInternal(node, to_fanin_node);
     }
-    node->mutable_input()->DeleteSubrange(last_pos + 1, last_idx - last_pos);
-    return true;
   }
 
-  return false;
-}
-
-bool MutableGraphView::DedupControllingFanins(absl::string_view node_name) {
-  NodeDef* node = GetNode(node_name);
-  if (node == nullptr) {
-    return false;
-  }
-  return DedupControllingFanins(node);
-}
-
-bool MutableGraphView::DedupControllingFanins() {
-  const int num_nodes = graph()->node_size();
-  bool modified = false;
-  for (int i = 0; i < num_nodes; ++i) {
-    modified |= DedupControllingFanins(graph()->mutable_node(i));
-  }
   return modified;
 }
 
-bool MutableGraphView::AddControllingFanin(absl::string_view node_name,
-                                           const TensorId& fanin) {
-  NodeDef* node = GetNode(node_name);
-  if (node == nullptr) {
-    return false;
-  }
-  NodeDef* fanin_node = GetNode(fanin.node());
-  if (fanin_node == nullptr) {
-    return false;
-  }
-  if (fanin.index() == Graph::kControlSlot) {
-    return AddFaninInternal(node, {fanin_node, Graph::kControlSlot});
-  }
-
-  if (!IsSwitch(*fanin_node)) {
-    return AddFaninInternal(node, {fanin_node, Graph::kControlSlot});
-  } else {
-    // We can't anchor control dependencies directly on the switch node: unlike
-    // other nodes only one of the outputs of the switch node will be generated
-    // when the switch node is executed, and we need to make sure the control
-    // dependency is only triggered when the corresponding output is triggered.
-    // We start by looking for an identity node connected to the output of the
-    // switch node, and use it to anchor the control dependency.
-    auto fanouts = GetFanouts(*fanin_node, /*include_controlled_nodes=*/false);
-    for (auto fanout : fanouts) {
-      if (IsIdentity(*fanout.node) || IsIdentityNSingleInput(*fanout.node)) {
-        if (ParseTensorName(fanout.node->input(0)) == fanin) {
-          return AddFaninInternal(node, {fanout.node, Graph::kControlSlot});
-        }
-      }
-    }
-    // We haven't found an existing node where we can anchor the control
-    // dependency: add a new identity node.
-    string ctrl_dep_name = AddPrefixToNodeName(
-        absl::StrCat(fanin.node(), "_", fanin.index()), kMutableGraphViewCtrl);
-
-    NodeDef* ctrl_dep_node = GetNode(ctrl_dep_name);
-    if (ctrl_dep_node == nullptr) {
-      NodeDef new_node;
-      new_node.set_name(ctrl_dep_name);
-      new_node.set_op("Identity");
-      new_node.set_device(fanin_node->device());
-      (*new_node.mutable_attr())["T"].set_type(
-          fanin_node->attr().at("T").type());
-      new_node.add_input(TensorIdToString(fanin));
-      ctrl_dep_node = AddNode(std::move(new_node));
-    }
-    return AddFaninInternal(node, {ctrl_dep_node, Graph::kControlSlot});
-  }
-}
-
 void MutableGraphView::DeleteNodes(const std::set<string>& nodes_to_delete) {
   for (const string& node_name_to_delete : nodes_to_delete)
     RemoveFaninsInternal(nodes().at(node_name_to_delete),
@@ -476,19 +624,18 @@ void MutableGraphView::RemoveFaninsInternal(NodeDef* deleted_node,
                                             bool keep_controlling_fanins) {
   for (int i = 0; i < deleted_node->input_size(); ++i) {
     TensorId tensor_id = ParseTensorName(deleted_node->input(i));
-    if (keep_controlling_fanins && tensor_id.index() < 0) {
+    if (keep_controlling_fanins && IsTensorIdControlling(tensor_id)) {
       break;
     }
     OutputPort fanin(nodes()[tensor_id.node()], tensor_id.index());
 
     InputPort input;
     input.node = deleted_node;
-    if (tensor_id.index() < 0)
-      input.port_id = Graph::kControlSlot;
-    else
-      input.port_id = i;
+    input.port_id = IsTensorIdControlling(tensor_id) ? Graph::kControlSlot : i;
 
-    fanouts()[fanin].erase(input);
+    absl::flat_hash_set<InputPort>* fanouts_set = &fanouts()[fanin];
+    fanouts_set->erase(input);
+    UpdateMaxRegularOutputPortForRemovedFanin(fanin, *fanouts_set);
   }
 }
 
diff --git a/tensorflow/core/grappler/mutable_graph_view.h b/tensorflow/core/grappler/mutable_graph_view.h
index f7c2a1118e5f879fecca2a1fc37d2e906df19ec4..f07254068d2aff6f8915b2dd9f677bdc0ce458f7 100644
--- a/tensorflow/core/grappler/mutable_graph_view.h
+++ b/tensorflow/core/grappler/mutable_graph_view.h
@@ -24,8 +24,10 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -41,7 +43,7 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
  public:
   explicit MutableGraphView(GraphDef* graph) : GraphViewInternal(graph) {
     for (NodeDef& node : *graph->mutable_node()) AddUniqueNodeOrDie(&node);
-    for (NodeDef& node : *graph->mutable_node()) AddFanouts(&node);
+    for (NodeDef& node : *graph->mutable_node()) AddAndDedupFanouts(&node);
   }
 
   // Lookup fanouts/fanins using immutable ports.
@@ -63,60 +65,30 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
   // Updates all fanouts (input ports fetching output tensors) from `from_node`
   // to the `to_node`, including control dependencies.
   //
-  // Example: We have 2 nodes that use `bar` node output tensors as inputs:
-  //   1. foo1(bar:0, bar:1, other:0, ^bar)
+  // Example: We have 3 nodes that use `bar` node output tensors as inputs:
+  //   1. foo1(bar:0, bar:1, other:0)
   //   2. foo2(bar:1, other:1)
+  //   3. foo3(other:2, ^bar)
   //
   // After calling ForwardOutputs(bar, new_bar):
-  //   1. foo1(new_bar:0, new_bar:1, other:0, ^new_bar)
+  //   1. foo1(new_bar:0, new_bar:1, other:0)
   //   2. foo2(new_bar:1, other:1)
-  void UpdateFanouts(absl::string_view from_node, absl::string_view to_node);
-
-  // Add fanin to node `node_name`. If the node or fanin do not exist in the
-  // graph, nothing will be modified in the graph. If fanin is a control
-  // dependency, existing control dependencies will be checked first before
-  // adding. Otherwise fanin will be added after existing non control dependency
-  // inputs.
-  //
-  // This will return true iff the node is modified. If a control dependency
-  // already exists, the node will not be modified.
-  bool AddFanin(absl::string_view node_name, const TensorId& fanin);
-
-  // Remove fanin from node `node_name`. If the node or fanin do not exist in
-  // the graph, nothing will be modified in the graph. If there are multiple
-  // inputs that match the fanin, all of them will be removed.
+  //   3. foo3(other:2, ^new_bar)
   //
-  // This will return true iff the node is modified. If no inputs match the
-  // fanin, the node will not be modified.
-  bool RemoveFanin(absl::string_view node_name, const TensorId& fanin);
+  // This will return true iff the nodes are modified.
+  bool UpdateFanouts(absl::string_view from_node, absl::string_view to_node);
 
-  // Remove all fanins from node `node_name`. Control dependencies will be
-  // retained if keep_controlling_fanins is true.
+  // Adds regular fanin `fanin` to node `node_name`. If the node or fanin do not
+  // exist in the graph, nothing will be modified in the graph. Otherwise fanin
+  // will be added after existing non control dependency fanins. Control
+  // dependencies will be deduped. To add control dependencies, use
+  // AddControllingFanin.
   //
   // This will return true iff the node is modified.
-  bool RemoveAllFanins(absl::string_view node_name,
-                       bool keep_controlling_fanins);
+  bool AddRegularFanin(absl::string_view node_name, const TensorId& fanin);
 
-  // Replace all fanins `from_fanin` with `to_fanin` in node `node_name`. If
-  // the fanins or node do not exist, nothing will be modified in the graph.
-  //
-  // This will return true iff the node is modified.
-  bool UpdateFanin(absl::string_view node_name, const TensorId& from_fanin,
-                   const TensorId& to_fanin);
-
-  // Removes redundant control fanins from node `node_name`.
-  //
-  // This will return true iff the node is modified.
-  // TODO(lyandy): Measure performance of deduping on every AddFanin compared to
-  // deduping once at the end.
-  bool DedupControllingFanins(absl::string_view node_name);
-
-  // Removes redundant control fanins from all nodes in the graph.
-  //
-  // This will return true iff the node is modified.
-  bool DedupControllingFanins();
-
-  // Adds a control dependency to the target node named `node_name`.
+  // Adds control dependency `fanin` to the target node named `node_name`. To
+  // add regular fanins, use AddRegularFanin.
   //
   // Case 1: If the fanin is not a Switch node, the control dependency is simply
   // added to the target node:
@@ -136,31 +108,77 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
   // This will return true iff the node is modified.
   bool AddControllingFanin(absl::string_view node_name, const TensorId& fanin);
 
+  // Removes regular fanin `fanin` from node `node_name`. If the node or fanin
+  // do not exist in the graph, nothing will be modified in the graph. If there
+  // are multiple inputs that match the fanin, all of them will be removed. To
+  // remove controlling fanins, use RemoveControllingFanin.
+  //
+  // This will return true iff the node is modified.
+  bool RemoveRegularFanin(absl::string_view node_name, const TensorId& fanin);
+
+  // Removes control dependency `fanin_node_name` from the target node named
+  // `node_name`. If the node or fanin do not exist in the graph, nothing will
+  // be modified in the graph. To remove regular fanins, use RemoveRegualrFanin.
+  //
+  // This will return true iff the node is modified.
+  bool RemoveControllingFanin(absl::string_view node_name,
+                              absl::string_view fanin_node_name);
+
+  // Removes all fanins from node `node_name`. Control dependencies will be
+  // retained if keep_controlling_fanins is true.
+  //
+  // This will return true iff the node is modified.
+  bool RemoveAllFanins(absl::string_view node_name,
+                       bool keep_controlling_fanins);
+
+  // Replaces all fanins `from_fanin` with `to_fanin` in node `node_name`. If
+  // the fanins or node do not exist, nothing will be modified in the graph.
+  // Control dependencies will be deduped.
+  //
+  // This will return true iff the node is modified.
+  bool UpdateFanin(absl::string_view node_name, const TensorId& from_fanin,
+                   const TensorId& to_fanin);
+
   // Deletes nodes from the graph.
   void DeleteNodes(const std::set<string>& nodes_to_delete);
 
  private:
+  // Adds fanouts for fanins of node to graph, while deduping control
+  // dependencies from existing control dependencies and regular fanins. Note,
+  // node inputs will be mutated if control dependencies can be deduped.
+  void AddAndDedupFanouts(NodeDef* node);
+
+  // Finds next output port smaller than fanin.port_id and update. The
+  // max_regular_output_port is only updated if fanin.port_id is the same as the
+  // current max_regular_output_port and if the fanouts set is empty. If there
+  // are no regular outputs, max_regular_output_port will be erased.
+  void UpdateMaxRegularOutputPortForRemovedFanin(
+      const OutputPort& fanin,
+      const absl::flat_hash_set<InputPort>& fanin_fanouts);
+
   // Updates all fanouts (input ports fetching output tensors) from `from_node`
   // to the `to_node`, including control dependencies.
   //
-  // Example: We have 2 nodes that use `bar` node output tensors as inputs:
-  //   1. foo1(bar:0, bar:1, other:0, ^bar)
+  // Example: We have 3 nodes that use `bar` node output tensors as inputs:
+  //   1. foo1(bar:0, bar:1, other:0)
   //   2. foo2(bar:1, other:1)
+  //   3. foo3(other:2, ^bar)
   //
   // After calling ForwardOutputs(bar, new_bar):
-  //   1. foo1(new_bar:0, new_bar:1, other:0, ^new_bar)
+  //   1. foo1(new_bar:0, new_bar:1, other:0)
   //   2. foo2(new_bar:1, other:1)
+  //   3. foo3(other:2, ^new_bar)
   //
   // IMPORTANT: If `from_node` or `to_node` is not in the underlying graph, the
   // behavior is undefined.
-  void UpdateFanouts(NodeDef* from_node, NodeDef* to_node);
+  bool UpdateFanoutsInternal(NodeDef* from_node, NodeDef* to_node);
 
   // Removes fanins of the deleted node from internal state. Control
   // dependencies are retained iff keep_controlling_fanins is true.
   void RemoveFaninsInternal(NodeDef* deleted_node,
                             bool keep_controlling_fanins);
 
-  // Add fanin to node. If fanin is a control dependency, existing control
+  // Adds fanin to node. If fanin is a control dependency, existing control
   // dependencies will be checked first before adding. Otherwise fanin will be
   // added after existing non control dependency inputs.
   //
@@ -168,7 +186,7 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
   // already exists, the node will not be modified.
   bool AddFaninInternal(NodeDef* node, const OutputPort& fanin);
 
-  // Add fanin to node. If the node or fanin do not exist in the graph, nothing
+  // Adds fanin to node. If the node or fanin do not exist in the graph, nothing
   // will be modified in the graph. If fanin is a control dependency, existing
   // control dependencies will be checked first before adding. Otherwise fanin
   // will be added after existing non control dependency inputs.
@@ -178,10 +196,22 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
   bool AddFaninInternal(NodeDef* node, const TensorId& fanin);
 
   // Removes any fanin in node that matches to a fanin in fanins.
-  bool RemoveFanins(NodeDef* node, absl::Span<const TensorId> fanins);
+  //
+  // This will return true iff the node is modified.
+  bool RemoveRegularFaninInternal(NodeDef* node, const TensorId& fanin);
 
-  // Removes redundant control fanins from node.
-  bool DedupControllingFanins(NodeDef* node);
+  // Removes controlling fanin `fanin_node_name` from node if such controlling
+  // fanin exists.
+  //
+  // This will return true iff the node is modified.
+  bool RemoveControllingFaninInternal(NodeDef* node,
+                                      absl::string_view fanin_node_name);
+
+  // Removes controlling fanin `fanin_node` from node if such controlling fanin
+  // exists.
+  //
+  // This will return true iff the node is modified.
+  bool RemoveControllingFaninInternal(NodeDef* node, NodeDef* fanin_node);
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/mutable_graph_view_test.cc b/tensorflow/core/grappler/mutable_graph_view_test.cc
index cdc212f6f9ecf9575e011e76a4ea1126ae534b6d..2048c67e3e5b28571c6f24bfa186446c40d2002b 100644
--- a/tensorflow/core/grappler/mutable_graph_view_test.cc
+++ b/tensorflow/core/grappler/mutable_graph_view_test.cc
@@ -35,7 +35,8 @@ TEST(MutableGraphViewTest, AddAndUpdateFanouts) {
       {NDef("bar", "NotImportant", {}, {}),
        NDef("other", "NotImportant", {}, {}),
        NDef("foo_1", "NotImportant", {"bar", "other", "bar:1", "^bar"}),
-       NDef("foo_2", "NotImportant", {"other:1", "bar:2", "^bar"})},
+       NDef("foo_2", "NotImportant", {"other:1", "bar:2", "^bar"}),
+       NDef("foo_3", "NotImportant", {"other:2", "^bar"})},
       /*funcs=*/{});
 
   MutableGraphView graph(&graph_def);
@@ -43,7 +44,56 @@ TEST(MutableGraphViewTest, AddAndUpdateFanouts) {
   NodeDef* new_bar = graph.AddNode(NDef("new_bar", "NotImportant", {}, {}));
   NodeDef* bar = graph.GetNode("bar");
 
-  graph.UpdateFanouts(bar->name(), new_bar->name());
+  EXPECT_TRUE(graph.UpdateFanouts(bar->name(), new_bar->name()));
+
+  // Fanout nodes must have their inputs updated.
+  NodeDef* foo_1 = graph.GetNode("foo_1");
+  ASSERT_NE(foo_1, nullptr);
+  ASSERT_EQ(foo_1->input_size(), 3);
+  EXPECT_EQ(foo_1->input(0), "new_bar");
+  EXPECT_EQ(foo_1->input(1), "other");
+  EXPECT_EQ(foo_1->input(2), "new_bar:1");
+
+  NodeDef* foo_2 = graph.GetNode("foo_2");
+  ASSERT_NE(foo_2, nullptr);
+  ASSERT_EQ(foo_2->input_size(), 2);
+  EXPECT_EQ(foo_2->input(0), "other:1");
+  EXPECT_EQ(foo_2->input(1), "new_bar:2");
+
+  NodeDef* foo_3 = graph.GetNode("foo_3");
+  ASSERT_NE(foo_3, nullptr);
+  ASSERT_EQ(foo_3->input_size(), 2);
+  EXPECT_EQ(foo_3->input(0), "other:2");
+  EXPECT_EQ(foo_3->input(1), "^new_bar");
+
+  // And fanouts mapping must be also updated for both nodes.
+  bool include_control_fanouts = true;
+  auto old_node_fanouts = graph.GetFanouts(*bar, include_control_fanouts);
+  auto new_node_fanouts = graph.GetFanouts(*new_bar, include_control_fanouts);
+
+  EXPECT_TRUE(old_node_fanouts.empty());
+
+  EXPECT_EQ(new_node_fanouts.size(), 4);
+  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_1, 0)), 1);
+  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_1, 2)), 1);
+  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_2, 1)), 1);
+  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_3, -1)), 1);
+}
+
+TEST(MutableGraphViewTest, AddAndUpdateFanoutsKeepControls) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("bar_1", "Switch", {}, {}), NDef("bar_2", "Identity", {"bar_1:1"}),
+       NDef("other", "NotImportant", {}, {}),
+       NDef("foo_1", "NotImportant", {"bar_2", "other", "bar_2:1", "^bar_2"}),
+       NDef("foo_2", "NotImportant", {"other:1", "bar_2:2", "^bar_2"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* new_bar = graph.AddNode(NDef("new_bar", "Identity", {"bar_1:2"}));
+  NodeDef* bar_2 = graph.GetNode("bar_2");
+
+  EXPECT_TRUE(graph.UpdateFanouts(bar_2->name(), new_bar->name()));
 
   // Fanout nodes must have their inputs updated.
   NodeDef* foo_1 = graph.GetNode("foo_1");
@@ -63,7 +113,7 @@ TEST(MutableGraphViewTest, AddAndUpdateFanouts) {
 
   // And fanouts mapping must be also updated for both nodes.
   bool include_control_fanouts = true;
-  auto old_node_fanouts = graph.GetFanouts(*bar, include_control_fanouts);
+  auto old_node_fanouts = graph.GetFanouts(*bar_2, include_control_fanouts);
   auto new_node_fanouts = graph.GetFanouts(*new_bar, include_control_fanouts);
 
   EXPECT_TRUE(old_node_fanouts.empty());
@@ -78,7 +128,8 @@ TEST(MutableGraphViewTest, AddAndUpdateFanoutsWithoutSelfLoops) {
   // Actual node.op() is not important in this test.
   GraphDef graph_def =
       test::function::GDef({NDef("bar", "NotImportant", {}, {}),
-                            NDef("foo", "NotImportant", {"bar", "^bar"})},
+                            NDef("foo_1", "NotImportant", {"bar", "^bar"}),
+                            NDef("foo_2", "NotImportant", {"^bar"})},
                            /*funcs=*/{});
 
   MutableGraphView graph(&graph_def);
@@ -87,14 +138,18 @@ TEST(MutableGraphViewTest, AddAndUpdateFanoutsWithoutSelfLoops) {
   NodeDef* new_bar = graph.AddNode(NDef("new_bar", "NewBar", {"bar"}, {}));
   NodeDef* bar = graph.GetNode("bar");
 
-  graph.UpdateFanouts("bar", new_bar->name());
+  EXPECT_TRUE(graph.UpdateFanouts("bar", new_bar->name()));
 
   // Foo node must read from `new_bar`.
-  NodeDef* foo = graph.GetNode("foo");
-  ASSERT_NE(foo, nullptr);
-  ASSERT_EQ(foo->input_size(), 2);
-  EXPECT_EQ(foo->input(0), "new_bar");
-  EXPECT_EQ(foo->input(1), "^new_bar");
+  NodeDef* foo_1 = graph.GetNode("foo_1");
+  ASSERT_NE(foo_1, nullptr);
+  ASSERT_EQ(foo_1->input_size(), 1);
+  EXPECT_EQ(foo_1->input(0), "new_bar");
+
+  NodeDef* foo_2 = graph.GetNode("foo_2");
+  ASSERT_NE(foo_2, nullptr);
+  ASSERT_EQ(foo_2->input_size(), 1);
+  EXPECT_EQ(foo_2->input(0), "^new_bar");
 
   // And the `new_bar` should read from the original `bar`.
   ASSERT_EQ(new_bar->input_size(), 1);
@@ -109,8 +164,8 @@ TEST(MutableGraphViewTest, AddAndUpdateFanoutsWithoutSelfLoops) {
   EXPECT_EQ(bar_fanouts.count(MutableGraphView::InputPort(new_bar, 0)), 1);
 
   EXPECT_EQ(new_bar_fanouts.size(), 2);
-  EXPECT_EQ(new_bar_fanouts.count(MutableGraphView::InputPort(foo, 0)), 1);
-  EXPECT_EQ(new_bar_fanouts.count(MutableGraphView::InputPort(foo, -1)), 1);
+  EXPECT_EQ(new_bar_fanouts.count(MutableGraphView::InputPort(foo_1, 0)), 1);
+  EXPECT_EQ(new_bar_fanouts.count(MutableGraphView::InputPort(foo_2, -1)), 1);
 }
 
 GraphDef SimpleMutateFaninGraph() {
@@ -131,10 +186,10 @@ GraphDef SimpleMutateFaninGraph() {
 void CompareNodeInputs(const MutableGraphView& graph, const NodeDef* expected,
                        NodeDef* actual) {
   ASSERT_EQ(actual->input_size(), expected->input_size());
-  int port;
   for (int i = 0; i < actual->input_size(); ++i) {
     EXPECT_EQ(actual->input(i), expected->input(i));
     TensorId tensor_id = ParseTensorName(expected->input(i));
+    int port;
     if (tensor_id.index() == Graph::kControlSlot) {
       port = Graph::kControlSlot;
     } else {
@@ -148,8 +203,9 @@ void CompareNodeInputs(const MutableGraphView& graph, const NodeDef* expected,
   }
 }
 
-void TestAddFanin(absl::string_view node_name, const TensorId& fanin_to_add,
-                  bool modified, const NodeDef* expected_node) {
+void TestAddRegularFanin(absl::string_view node_name,
+                         const TensorId& fanin_to_add, bool modified,
+                         const NodeDef* expected_node) {
   GraphDef graph_def = SimpleMutateFaninGraph();
 
   MutableGraphView graph(&graph_def);
@@ -161,70 +217,71 @@ void TestAddFanin(absl::string_view node_name, const TensorId& fanin_to_add,
     EXPECT_NE(node, nullptr);
   }
 
-  EXPECT_EQ(modified, graph.AddFanin(node_name, fanin_to_add));
+  EXPECT_EQ(modified, graph.AddRegularFanin(node_name, fanin_to_add));
   if (expected_node != nullptr) {
     CompareNodeInputs(graph, expected_node, node);
   }
 }
 
-TEST(MutableGraphViewTest, AddFanin) {
+TEST(MutableGraphViewTest, AddRegularFanin) {
   NodeDef expected_node;
   // Add input to node with 1 input 0 controls.
   expected_node = NDef("", "", {"a", "b:1"});
-  TestAddFanin("foo_1", {"b", 1}, /*modified=*/true, &expected_node);
+  TestAddRegularFanin("foo_1", {"b", 1}, /*modified=*/true, &expected_node);
   // Add input to node with multiple inputs and 0 controls.
   expected_node = NDef("", "", {"b", "a:1", "a:1", "b:2"});
-  TestAddFanin("foo_3", {"b", 2}, /*modified=*/true, &expected_node);
+  TestAddRegularFanin("foo_3", {"b", 2}, /*modified=*/true, &expected_node);
   // Add input to node with 1 input multiple controls.
-  expected_node = NDef("", "", {"b", "a", "^c", "^a"});
-  TestAddFanin("foo_2", {"a", 0}, /*modified=*/true, &expected_node);
+  expected_node = NDef("", "", {"b", "a", "^c"});
+  TestAddRegularFanin("foo_2", {"a", 0}, /*modified=*/true, &expected_node);
   // Add input to node with multiple inputs and controls.
   expected_node = NDef("", "", {"a", "b:2", "b:2", "a:1", "^d", "^c"});
-  TestAddFanin("foo_4", {"a", 1}, /*modified=*/true, &expected_node);
+  TestAddRegularFanin("foo_4", {"a", 1}, /*modified=*/true, &expected_node);
   // Add input to node with 0 inputs 0 controls.
   expected_node = NDef("", "", {"a:1"});
-  TestAddFanin("foo_5", {"a", 1}, /*modified=*/true, &expected_node);
+  TestAddRegularFanin("foo_5", {"a", 1}, /*modified=*/true, &expected_node);
   // Add input to node with 0 inputs multiple controls.
   expected_node = NDef("", "", {"c:1", "^b", "^a"});
-  TestAddFanin("foo_6", {"c", 1}, /*modified=*/true, &expected_node);
+  TestAddRegularFanin("foo_6", {"c", 1}, /*modified=*/true, &expected_node);
 
   // Add control to node with 1 input 0 controls.
-  expected_node = NDef("", "", {"a", "^b"});
-  TestAddFanin("foo_1", {"b", Graph::kControlSlot}, /*modified=*/true,
-               &expected_node);
+  expected_node = NDef("", "", {"a"});
+  TestAddRegularFanin("foo_1", {"b", Graph::kControlSlot}, /*modified=*/false,
+                      &expected_node);
   // Add control to node with multiple inputs and 0 controls.
-  expected_node = NDef("", "", {"b", "a:1", "a:1", "^c"});
-  TestAddFanin("foo_3", {"c", Graph::kControlSlot}, /*modified=*/true,
-               &expected_node);
+  expected_node = NDef("", "", {"b", "a:1", "a:1"});
+  TestAddRegularFanin("foo_3", {"c", Graph::kControlSlot}, /*modified=*/false,
+                      &expected_node);
   // Add control to node with 1 input multiple controls.
-  expected_node = NDef("", "", {"b", "^a", "^c", "^d"});
-  TestAddFanin("foo_2", {"d", Graph::kControlSlot}, /*modified=*/true,
-               &expected_node);
+  expected_node = NDef("", "", {"b", "^a", "^c"});
+  TestAddRegularFanin("foo_2", {"d", Graph::kControlSlot}, /*modified=*/false,
+                      &expected_node);
   // Add control to node with multiple input multiple controls.
-  expected_node = NDef("", "", {"a", "b:2", "b:2", "^c", "^d", "^a"});
-  TestAddFanin("foo_4", {"a", Graph::kControlSlot}, /*modified=*/true,
-               &expected_node);
+  expected_node = NDef("", "", {"a", "b:2", "b:2", "^c", "^d"});
+  TestAddRegularFanin("foo_4", {"a", Graph::kControlSlot},
+                      /*modified=*/false, &expected_node);
   // Add control to node with 0 inputs 0 controls.
-  expected_node = NDef("", "", {"^a"});
-  TestAddFanin("foo_5", {"a", Graph::kControlSlot}, /*modified=*/true,
-               &expected_node);
+  expected_node = NDef("", "", {});
+  TestAddRegularFanin("foo_5", {"a", Graph::kControlSlot}, /*modified=*/false,
+                      &expected_node);
   // Add control to node with 0 inputs multiple controls.
-  expected_node = NDef("", "", {"^a", "^b", "^c"});
-  TestAddFanin("foo_6", {"c", Graph::kControlSlot}, /*modified=*/true,
-               &expected_node);
+  expected_node = NDef("", "", {"^a", "^b"});
+  TestAddRegularFanin("foo_6", {"c", Graph::kControlSlot}, /*modified=*/false,
+                      &expected_node);
   // Add control to node with control that already exists.
   expected_node = NDef("", "", {"b", "^a", "^c"});
-  TestAddFanin("foo_2", {"a", Graph::kControlSlot}, /*modified=*/false,
-               &expected_node);
+  TestAddRegularFanin("foo_2", {"a", Graph::kControlSlot},
+                      /*modified=*/false, &expected_node);
 
   // Add fanin to node where node is missing.
-  TestAddFanin("foo_missing", {"a", 0}, /*modified=*/false, nullptr);
+  TestAddRegularFanin("foo_missing", {"a", 0}, /*modified=*/false, nullptr);
   // Add fanin to node where fanin is missing.
   expected_node = NDef("", "", {"a"});
-  TestAddFanin("foo_1", {"bar_missing", 0}, /*modified=*/false, &expected_node);
+  TestAddRegularFanin("foo_1", {"bar_missing", 0}, /*modified=*/false,
+                      &expected_node);
   // Add fanin to node where node and fanin are missing.
-  TestAddFanin("foo_missing", {"bar_missing", 0}, /*modified=*/false,
-               /*expected_node=*/nullptr);
+  TestAddRegularFanin("foo_missing", {"bar_missing", 0}, /*modified=*/false,
+                      /*expected_node=*/nullptr);
 }
 
 void CheckFanout(const MutableGraphView& graph, const TensorId& fanin,
@@ -237,9 +294,9 @@ void CheckFanout(const MutableGraphView& graph, const TensorId& fanin,
   }
 }
 
-void TestRemoveFanin(absl::string_view node_name,
-                     const TensorId& fanin_to_remove, bool modified,
-                     const NodeDef* expected_node) {
+void TestRemoveRegularFanin(absl::string_view node_name,
+                            const TensorId& fanin_to_remove, bool modified,
+                            const NodeDef* expected_node) {
   GraphDef graph_def = SimpleMutateFaninGraph();
 
   MutableGraphView graph(&graph_def);
@@ -251,7 +308,7 @@ void TestRemoveFanin(absl::string_view node_name,
     EXPECT_NE(nullptr, node);
   }
 
-  EXPECT_EQ(modified, graph.RemoveFanin(node_name, fanin_to_remove));
+  EXPECT_EQ(modified, graph.RemoveRegularFanin(node_name, fanin_to_remove));
   if (expected_node != nullptr) {
     CompareNodeInputs(graph, expected_node, node);
     if (modified) {
@@ -260,63 +317,68 @@ void TestRemoveFanin(absl::string_view node_name,
   }
 }
 
-TEST(MutableGraphViewTest, RemoveFanin) {
+TEST(MutableGraphViewTest, RemoveRegularFanin) {
   NodeDef expected_node;
   // Remove input from node with 1 input 0 controls.
   expected_node = NDef("", "", {});
-  TestRemoveFanin("foo_1", {"a", 0}, /*modified=*/true, &expected_node);
+  TestRemoveRegularFanin("foo_1", {"a", 0}, /*modified=*/true, &expected_node);
   // Remove input from node with multiple inputs and 0 controls.
   expected_node = NDef("", "", {"b"});
-  TestRemoveFanin("foo_3", {"a", 1}, /*modified=*/true, &expected_node);
+  TestRemoveRegularFanin("foo_3", {"a", 1}, /*modified=*/true, &expected_node);
   // Remove input from node with 1 input multiple controls.
   expected_node = NDef("", "", {"^a", "^c"});
-  TestRemoveFanin("foo_2", {"b", 0}, /*modified=*/true, &expected_node);
+  TestRemoveRegularFanin("foo_2", {"b", 0}, /*modified=*/true, &expected_node);
   // Remove input from node with multiple inputs and controls.
   expected_node = NDef("", "", {"a", "^c", "^d"});
-  TestRemoveFanin("foo_4", {"b", 2}, /*modified=*/true, &expected_node);
+  TestRemoveRegularFanin("foo_4", {"b", 2}, /*modified=*/true, &expected_node);
+  // Remove input from node with multiple inputs and controls, and results in
+  // shifting of ports.
+  expected_node = NDef("", "", {"b:2", "b:2", "^c", "^d"});
+  TestRemoveRegularFanin("foo_4", {"a", 0}, /*modified=*/true, &expected_node);
 
   // Remove control from node with 1 input multiple controls.
-  expected_node = NDef("", "", {"b", "^c"});
-  TestRemoveFanin("foo_2", {"a", Graph::kControlSlot}, /*modified=*/true,
-                  &expected_node);
+  expected_node = NDef("", "", {"b", "^a", "^c"});
+  TestRemoveRegularFanin("foo_2", {"a", Graph::kControlSlot},
+                         /*modified=*/false, &expected_node);
   // Remove control from node with multiple input multiple controls.
-  expected_node = NDef("", "", {"a", "b:2", "b:2", "^c"});
-  TestRemoveFanin("foo_4", {"d", Graph::kControlSlot}, /*modified=*/true,
-                  &expected_node);
+  expected_node = NDef("", "", {"a", "b:2", "b:2", "^c", "^d"});
+  TestRemoveRegularFanin("foo_4", {"d", Graph::kControlSlot},
+                         /*modified=*/false, &expected_node);
   // Remove control from node with 0 inputs multiple controls.
-  expected_node = NDef("", "", {"^b"});
-  TestRemoveFanin("foo_6", {"a", Graph::kControlSlot}, /*modified=*/true,
-                  &expected_node);
+  expected_node = NDef("", "", {"^a", "^b"});
+  TestRemoveRegularFanin("foo_6", {"a", Graph::kControlSlot},
+                         /*modified=*/false, &expected_node);
 
   // Remove input from node with 0 inputs 0 controls.
   expected_node = NDef("", "", {});
-  TestRemoveFanin("foo_5", {"a", 1}, /*modified=*/false, &expected_node);
+  TestRemoveRegularFanin("foo_5", {"a", 1}, /*modified=*/false, &expected_node);
   // Remove input from node with 0 inputs multiple controls.
   expected_node = NDef("", "", {"^a", "^b"});
-  TestRemoveFanin("foo_6", {"a", 1}, /*modified=*/false, &expected_node);
+  TestRemoveRegularFanin("foo_6", {"a", 1}, /*modified=*/false, &expected_node);
+
   // Remove control from node with 1 input 0 controls.
   expected_node = NDef("", "", {"a"});
-  TestRemoveFanin("foo_1", {"b", Graph::kControlSlot}, /*modified=*/false,
-                  &expected_node);
+  TestRemoveRegularFanin("foo_1", {"b", Graph::kControlSlot},
+                         /*modified=*/false, &expected_node);
   // Remove control from node with multiple inputs and 0 controls.
   expected_node = NDef("", "", {"b", "a:1", "a:1"});
-  TestRemoveFanin("foo_3", {"c", Graph::kControlSlot}, /*modified=*/false,
-                  &expected_node);
+  TestRemoveRegularFanin("foo_3", {"c", Graph::kControlSlot},
+                         /*modified=*/false, &expected_node);
   // Remove control from node with 0 inputs 0 controls.
   expected_node = NDef("", "", {});
-  TestRemoveFanin("foo_5", {"a", Graph::kControlSlot}, /*modified=*/false,
-                  &expected_node);
+  TestRemoveRegularFanin("foo_5", {"a", Graph::kControlSlot},
+                         /*modified=*/false, &expected_node);
 
   // Remove fanin from node where node is missing.
-  TestRemoveFanin("foo_missing", {"a", 0}, /*modified=*/false,
-                  /*expected_node=*/nullptr);
+  TestRemoveRegularFanin("foo_missing", {"a", 0}, /*modified=*/false,
+                         /*expected_node=*/nullptr);
   // Remove fanin from node where fanin is missing.
   expected_node = NDef("", "", {"a"});
-  TestRemoveFanin("foo_1", {"bar_missing", 0}, /*modified=*/false,
-                  &expected_node);
+  TestRemoveRegularFanin("foo_1", {"bar_missing", 0}, /*modified=*/false,
+                         &expected_node);
   // Remove fanin from node where node and fanin are missing.
-  TestRemoveFanin("foo_missing", {"bar_missing", 0}, /*modified=*/false,
-                  /*expected_node=*/nullptr);
+  TestRemoveRegularFanin("foo_missing", {"bar_missing", 0}, /*modified=*/false,
+                         /*expected_node=*/nullptr);
 }
 
 void TestRemoveAllFanins(absl::string_view node_name,
@@ -431,7 +493,7 @@ TEST(MutableGraphViewTest, UpdateFanin) {
   TestUpdateFanin("foo_4", {"d", Graph::kControlSlot}, {"d", 1},
                   /*modified=*/true, &expected_node);
   // Update fanin from control to control.
-  expected_node = NDef("", "", {"a", "b:2", "b:2", "^d", "^b"});
+  expected_node = NDef("", "", {"a", "b:2", "b:2", "^d"});
   TestUpdateFanin("foo_4", {"c", Graph::kControlSlot},
                   {"b", Graph::kControlSlot}, /*modified=*/true,
                   &expected_node);
@@ -463,85 +525,278 @@ TEST(MutableGraphViewTest, UpdateFanin) {
                   /*modified=*/false, /*expected_node=*/nullptr);
 }
 
-GraphDef SimpleDuplicateControllingFaninsGraph() {
+TEST(MutableGraphViewTest, DedupControllingFaninsOnGraphInit) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {
+          NDef("a", "NotImportant", {}, {}),
+          NDef("b", "NotImportant", {}, {}),
+          NDef("c", "Switch", {}, {}),
+          NDef("d", "Identity", {"c:1"}),
+          NDef("foo_1", "IdentityN", {"a", "b:1", "^b"}),
+          NDef("foo_2", "IdentityN", {"a", "^b", "^b"}),
+          NDef("foo_3", "IdentityN", {"a", "b:1", "^b", "^b"}),
+          NDef("foo_4", "IdentityN", {"a:2", "b:1", "^b", "^b", "^a", "^a"}),
+          NDef("foo_5", "NotImportant", {"a:2", "b:1", "^b", "^b", "^a", "^a"}),
+          NDef("foo_6", "Identity", {"d", "^d"}),
+          NDef("foo_7", "NotImportant",
+               {"a:3", "b:2", "d", "^d", "^d", "^a", "^b", "^a", "^b"}),
+      },
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  EXPECT_EQ(graph.graph()->node_size(), 11);
+  NodeDef expected;
+  NodeDef* a = graph.GetNode("a");
+  ASSERT_NE(a, nullptr);
+  expected = NDef("", "", {});
+  CompareNodeInputs(graph, &expected, a);
+
+  NodeDef* b = graph.GetNode("b");
+  ASSERT_NE(b, nullptr);
+  CompareNodeInputs(graph, &expected, b);
+
+  NodeDef* c = graph.GetNode("c");
+  ASSERT_NE(c, nullptr);
+  CompareNodeInputs(graph, &expected, c);
+
+  NodeDef* d = graph.GetNode("d");
+  ASSERT_NE(d, nullptr);
+  expected = NDef("", "", {"c:1"});
+  CompareNodeInputs(graph, &expected, d);
+
+  NodeDef* foo_1 = graph.GetNode("foo_1");
+  ASSERT_NE(foo_1, nullptr);
+  expected = NDef("", "", {"a", "b:1"});
+  CompareNodeInputs(graph, &expected, foo_1);
+
+  NodeDef* foo_2 = graph.GetNode("foo_2");
+  ASSERT_NE(foo_2, nullptr);
+  expected = NDef("", "", {"a", "^b"});
+  CompareNodeInputs(graph, &expected, foo_2);
+
+  NodeDef* foo_3 = graph.GetNode("foo_3");
+  ASSERT_NE(foo_3, nullptr);
+  expected = NDef("", "", {"a", "b:1"});
+  CompareNodeInputs(graph, &expected, foo_3);
+
+  NodeDef* foo_4 = graph.GetNode("foo_4");
+  ASSERT_NE(foo_4, nullptr);
+  expected = NDef("", "", {"a:2", "b:1"});
+  CompareNodeInputs(graph, &expected, foo_4);
+
+  NodeDef* foo_5 = graph.GetNode("foo_5");
+  ASSERT_NE(foo_5, nullptr);
+  expected = NDef("", "", {"a:2", "b:1"});
+  CompareNodeInputs(graph, &expected, foo_5);
+
+  NodeDef* foo_6 = graph.GetNode("foo_6");
+  ASSERT_NE(foo_6, nullptr);
+  expected = NDef("", "", {"d", "^d"});
+  CompareNodeInputs(graph, &expected, foo_6);
+
+  NodeDef* foo_7 = graph.GetNode("foo_7");
+  ASSERT_NE(foo_7, nullptr);
+  expected = NDef("", "", {"a:3", "b:2", "d", "^d"});
+  CompareNodeInputs(graph, &expected, foo_7);
+}
+
+TEST(MutableGraphViewTest, DedupControllingFaninsOnAddFanin) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"^a"}),
+       NDef("c", "NotImportant", {"a:1"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  EXPECT_TRUE(graph.AddRegularFanin("b", {"a", 2}));
+  NodeDef expected;
+  NodeDef* b = graph.GetNode("b");
+  ASSERT_NE(b, nullptr);
+  expected = NDef("", "", {"a:2"});
+  CompareNodeInputs(graph, &expected, b);
+
+  EXPECT_FALSE(graph.AddControllingFanin("c", {"a", Graph::kControlSlot}));
+  NodeDef* c = graph.GetNode("c");
+  ASSERT_NE(c, nullptr);
+  expected = NDef("", "", {"a:1"});
+  CompareNodeInputs(graph, &expected, c);
+}
+
+TEST(MutableGraphViewTest, NoDedupControlFlowControllingFaninsOnAddFanin) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "Switch", {}, {}), NDef("b", "Identity", {"a:1"}),
+       NDef("c", "", {}, {}), NDef("d", "", {}, {})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  EXPECT_TRUE(graph.AddRegularFanin("c", {"b", 2}));
+  NodeDef expected;
+  NodeDef* c = graph.GetNode("c");
+  ASSERT_NE(c, nullptr);
+  expected = NDef("", "", {"b:2"});
+  CompareNodeInputs(graph, &expected, c);
+  EXPECT_TRUE(graph.AddControllingFanin("c", {"b", Graph::kControlSlot}));
+  expected = NDef("", "", {"b:2", "^b"});
+  CompareNodeInputs(graph, &expected, c);
+  EXPECT_FALSE(graph.AddControllingFanin("c", {"b", Graph::kControlSlot}));
+  expected = NDef("", "", {"b:2", "^b"});
+  CompareNodeInputs(graph, &expected, c);
+
+  EXPECT_TRUE(graph.AddControllingFanin("d", {"b", Graph::kControlSlot}));
+  NodeDef* d = graph.GetNode("d");
+  ASSERT_NE(d, nullptr);
+  expected = NDef("", "", {"^b"});
+  CompareNodeInputs(graph, &expected, d);
+  EXPECT_FALSE(graph.AddControllingFanin("d", {"b", Graph::kControlSlot}));
+  expected = NDef("", "", {"^b"});
+  CompareNodeInputs(graph, &expected, d);
+  EXPECT_TRUE(graph.AddRegularFanin("d", {"b", 3}));
+  expected = NDef("", "", {"b:3", "^b"});
+  CompareNodeInputs(graph, &expected, d);
+}
+
+TEST(MutableGraphViewTest, DedupControllingFaninsOnUpdateFanin) {
   // Actual node.op() is not important in this test.
   GraphDef graph_def = test::function::GDef(
       {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {}, {}),
-       NDef("foo_1", "NotImportant", {"a", "b:1", "^b"}),
-       NDef("foo_2", "NotImportant", {"a", "^b", "^b"}),
-       NDef("foo_3", "NotImportant", {"a", "b:1", "^b", "^b"}),
-       NDef("foo_4", "NotImportant", {"a:2", "b:1", "^b", "^b", "^a", "^a"})},
+       NDef("c", "NotImportant", {"a:1", "^b"})},
       /*funcs=*/{});
-  return graph_def;
+
+  MutableGraphView graph(&graph_def);
+
+  EXPECT_TRUE(graph.UpdateFanin("c", {"a", 1}, {"b", 2}));
+  NodeDef* c = graph.GetNode("c");
+  ASSERT_NE(c, nullptr);
+  NodeDef expected = NDef("", "", {"b:2"});
+  CompareNodeInputs(graph, &expected, c);
 }
 
-void CheckDedupControllingFaninsForNode(MutableGraphView* graph,
-                                        absl::string_view node_name,
-                                        const NodeDef* expected_node) {
-  // Deduping again should result in no change.
-  EXPECT_FALSE(graph->DedupControllingFanins(node_name));
-  NodeDef* node = graph->GetNode(node_name);
-  ASSERT_NE(node, nullptr);
-  ASSERT_EQ(node->input_size(), expected_node->input_size());
-  CompareNodeInputs(*graph, expected_node, node);
-  for (int i = 0; i < node->input_size(); ++i) {
-    TensorId tensor_id = ParseTensorName(node->input(i));
-    if (tensor_id.index() > Graph::kControlSlot) {
-      CheckFanout(*graph, {tensor_id.node(), Graph::kControlSlot}, node_name);
-    }
-  }
+TEST(MutableGraphViewTest, NoDedupControlFlowControllingFaninsOnUpdateFanin) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "Switch", {}, {}), NDef("b", "Identity", {"a:1"}),
+       NDef("c", "Identity", {"a:2"}), NDef("d", "NotImportant", {"c", "^b"}),
+       NDef("e", "NotImportant", {"b", "^c"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  EXPECT_TRUE(graph.UpdateFanin("d", {"b", Graph::kControlSlot},
+                                {"c", Graph::kControlSlot}));
+  NodeDef expected;
+  NodeDef* d = graph.GetNode("d");
+  ASSERT_NE(d, nullptr);
+  expected = NDef("", "", {"c", "^c"});
+  CompareNodeInputs(graph, &expected, d);
+
+  EXPECT_TRUE(graph.UpdateFanin("e", {"b", 0}, {"c", 3}));
+  NodeDef* e = graph.GetNode("e");
+  ASSERT_NE(e, nullptr);
+  expected = NDef("", "", {"c:3", "^c"});
+  CompareNodeInputs(graph, &expected, e);
+
+  EXPECT_TRUE(graph.UpdateFanin("e", {"c", 3}, {"c", Graph::kControlSlot}));
+  ASSERT_NE(e, nullptr);
+  expected = NDef("", "", {"^c"});
+  CompareNodeInputs(graph, &expected, e);
 }
 
-void TestDedupControllingFaninsForNode(MutableGraphView* graph,
-                                       absl::string_view node_name,
-                                       const NodeDef* expected_node) {
-  EXPECT_TRUE(graph->DedupControllingFanins(node_name));
-  CheckDedupControllingFaninsForNode(graph, node_name, expected_node);
+TEST(MutableGraphViewTest, UpdateMaxRegularOutputPortOnAddFanin) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a:1"}),
+       NDef("c", "NotImportant", {"^b"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  EXPECT_TRUE(graph.AddRegularFanin("c", {"a", 3}));
+  NodeDef* a = graph.GetNode("a");
+  ASSERT_NE(a, nullptr);
+
+  auto fanouts = graph.GetFanouts(*a, /*include_controlled_nodes=*/true);
+  EXPECT_EQ(fanouts.size(), 2);
+
+  NodeDef* b = graph.GetNode("b");
+  ASSERT_NE(b, nullptr);
+  EXPECT_EQ(fanouts.count(MutableGraphView::InputPort(b, 0)), 1);
+
+  NodeDef* c = graph.GetNode("c");
+  ASSERT_NE(c, nullptr);
+  EXPECT_EQ(fanouts.count(MutableGraphView::InputPort(c, 0)), 1);
 }
 
-TEST(MutableGraphViewTest, DedupControllingFaninsForNode) {
-  GraphDef graph_def = SimpleDuplicateControllingFaninsGraph();
+TEST(MutableGraphViewTest, UpdateMaxRegularOutputPortOnRemoveFanin) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a:1"}),
+       NDef("c", "NotImportant", {"a:2"})},
+      /*funcs=*/{});
 
   MutableGraphView graph(&graph_def);
 
-  NodeDef expected_node;
-  // Remove redundant control dependency '^b'.
-  expected_node = NDef("", "", {"a", "b:1"});
-  TestDedupControllingFaninsForNode(&graph, "foo_1", &expected_node);
-  // Remove extra control dependency '^b'.
-  expected_node = NDef("", "", {"a", "^b"});
-  TestDedupControllingFaninsForNode(&graph, "foo_2", &expected_node);
-  // Remove redundant and extra control dependencies '^b'.
-  expected_node = NDef("", "", {"a", "b:1"});
-  TestDedupControllingFaninsForNode(&graph, "foo_3", &expected_node);
-  // Remove multiple redundant control dependencies.
-  expected_node = NDef("", "", {"a:2", "b:1"});
-  TestDedupControllingFaninsForNode(&graph, "foo_4", &expected_node);
-  // Missing node.
-  EXPECT_FALSE(graph.DedupControllingFanins("missing"));
+  EXPECT_TRUE(graph.RemoveRegularFanin("c", {"a", 2}));
+  NodeDef* a = graph.GetNode("a");
+  ASSERT_NE(a, nullptr);
+
+  auto fanouts = graph.GetFanouts(*a, /*include_controlled_nodes=*/true);
+  EXPECT_EQ(fanouts.size(), 1);
+
+  NodeDef* b = graph.GetNode("b");
+  ASSERT_NE(b, nullptr);
+  EXPECT_EQ(fanouts.count(MutableGraphView::InputPort(b, 0)), 1);
 }
 
-TEST(MutableGraphViewTest, DedupControllingFaninsForGraph) {
-  GraphDef graph_def = SimpleDuplicateControllingFaninsGraph();
+TEST(MutableGraphViewTest, KeepMaxRegularOutputPortOnRemoveFanin) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a:1"}),
+       NDef("c", "NotImportant", {"a:2"})},
+      /*funcs=*/{});
 
   MutableGraphView graph(&graph_def);
-  EXPECT_TRUE(graph.DedupControllingFanins());
-  // Deduping again should result in no change.
-  EXPECT_FALSE(graph.DedupControllingFanins());
 
-  NodeDef expected_node;
-  // Remove redundant control dependency '^b'.
-  expected_node = NDef("", "", {"a", "b:1"});
-  CheckDedupControllingFaninsForNode(&graph, "foo_1", &expected_node);
-  // Remove extra control dependency '^b'.
-  expected_node = NDef("", "", {"a", "^b"});
-  CheckDedupControllingFaninsForNode(&graph, "foo_2", &expected_node);
-  // Remove redundant and extra control dependencies '^b'.
-  expected_node = NDef("", "", {"a", "b:1"});
-  CheckDedupControllingFaninsForNode(&graph, "foo_3", &expected_node);
-  // Remove multiple redundant control dependencies.
-  expected_node = NDef("", "", {"a:2", "b:1"});
-  CheckDedupControllingFaninsForNode(&graph, "foo_4", &expected_node);
+  EXPECT_TRUE(graph.RemoveRegularFanin("b", {"a", 1}));
+  NodeDef* a = graph.GetNode("a");
+  ASSERT_NE(a, nullptr);
+
+  auto fanouts = graph.GetFanouts(*a, /*include_controlled_nodes=*/true);
+  EXPECT_EQ(fanouts.size(), 1);
+
+  NodeDef* c = graph.GetNode("c");
+  ASSERT_NE(c, nullptr);
+  EXPECT_EQ(fanouts.count(MutableGraphView::InputPort(c, 0)), 1);
+}
+
+TEST(MutableGraphViewTest, UpdateMaxRegularOutputPortOnUpdateFanin) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a:1"}),
+       NDef("c", "NotImportant", {"a:2"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  EXPECT_TRUE(graph.UpdateFanin("c", {"a", 2}, {"b", 3}));
+  NodeDef* a = graph.GetNode("a");
+  ASSERT_NE(a, nullptr);
+
+  auto a_fanouts = graph.GetFanouts(*a, /*include_controlled_nodes=*/true);
+  EXPECT_EQ(a_fanouts.size(), 1);
+
+  NodeDef* b = graph.GetNode("b");
+  ASSERT_NE(b, nullptr);
+  EXPECT_EQ(a_fanouts.count(MutableGraphView::InputPort(b, 0)), 1);
+
+  auto b_fanouts = graph.GetFanouts(*b, /*include_controlled_nodes=*/true);
+  EXPECT_EQ(b_fanouts.size(), 1);
+
+  NodeDef* c = graph.GetNode("c");
+  ASSERT_NE(c, nullptr);
+  EXPECT_EQ(b_fanouts.count(MutableGraphView::InputPort(c, 0)), 1);
 }
 
 TEST(MutableGraphViewTest, AddControllingFaninMissing) {
@@ -559,12 +814,15 @@ TEST(MutableGraphViewTest, AddControllingFaninMissing) {
   EXPECT_FALSE(graph.AddControllingFanin("c", {"d", Graph::kControlSlot}));
 
   ASSERT_EQ(graph.graph()->node_size(), 2);
+  NodeDef expected;
   NodeDef* a = graph.GetNode("a");
   ASSERT_NE(a, nullptr);
-  ASSERT_EQ(a->input_size(), 0);
+  expected = NDef("", "", {});
+  CompareNodeInputs(graph, &expected, a);
+
   NodeDef* b = graph.GetNode("b");
   ASSERT_NE(b, nullptr);
-  ASSERT_EQ(b->input_size(), 0);
+  CompareNodeInputs(graph, &expected, b);
 }
 
 TEST(MutableGraphViewTest, AddControllingFaninExistingControl) {
@@ -578,13 +836,16 @@ TEST(MutableGraphViewTest, AddControllingFaninExistingControl) {
   EXPECT_FALSE(graph.AddControllingFanin("a", {"b", Graph::kControlSlot}));
 
   ASSERT_EQ(graph.graph()->node_size(), 2);
+  NodeDef expected;
   NodeDef* a = graph.GetNode("a");
   ASSERT_NE(a, nullptr);
-  ASSERT_EQ(a->input_size(), 1);
-  EXPECT_EQ(a->input(0), "^b");
+  expected = NDef("", "", {"^b"});
+  CompareNodeInputs(graph, &expected, a);
+
   NodeDef* b = graph.GetNode("b");
   ASSERT_NE(b, nullptr);
-  ASSERT_EQ(b->input_size(), 0);
+  expected = NDef("", "", {});
+  CompareNodeInputs(graph, &expected, b);
 }
 
 TEST(MutableGraphViewTest, AddControllingFaninNotSwitch) {
@@ -598,13 +859,16 @@ TEST(MutableGraphViewTest, AddControllingFaninNotSwitch) {
   EXPECT_FALSE(graph.AddControllingFanin("a", {"b", 2}));
 
   ASSERT_EQ(graph.graph()->node_size(), 2);
+  NodeDef expected;
   NodeDef* a = graph.GetNode("a");
   ASSERT_NE(a, nullptr);
-  ASSERT_EQ(a->input_size(), 1);
-  EXPECT_EQ(a->input(0), "^b");
+  expected = NDef("", "", {"^b"});
+  CompareNodeInputs(graph, &expected, a);
+
   NodeDef* b = graph.GetNode("b");
   ASSERT_NE(b, nullptr);
-  ASSERT_EQ(b->input_size(), 0);
+  expected = NDef("", "", {});
+  CompareNodeInputs(graph, &expected, b);
 }
 
 TEST(MutableGraphViewTest, AddControllingFaninSwitchWithIdentity) {
@@ -621,8 +885,8 @@ TEST(MutableGraphViewTest, AddControllingFaninSwitchWithIdentity) {
   ASSERT_EQ(graph.graph()->node_size(), 3);
   NodeDef* a = graph.GetNode("a");
   ASSERT_NE(a, nullptr);
-  ASSERT_EQ(a->input_size(), 1);
-  EXPECT_EQ(a->input(0), "^identity");
+  NodeDef expected = NDef("", "", {"^identity"});
+  CompareNodeInputs(graph, &expected, a);
 }
 
 TEST(MutableGraphViewTest, AddControllingFaninSwitchWithNoExistingIdentity) {
@@ -638,14 +902,16 @@ TEST(MutableGraphViewTest, AddControllingFaninSwitchWithNoExistingIdentity) {
   EXPECT_FALSE(graph.AddControllingFanin("a", {"switch", 0}));
 
   ASSERT_EQ(graph.graph()->node_size(), 3);
+  NodeDef expected;
   NodeDef* a = graph.GetNode("a");
   ASSERT_NE(a, nullptr);
-  ASSERT_EQ(a->input_size(), 1);
-  EXPECT_EQ(a->input(0), "^ConstantFoldingCtrl/switch_0");
+  expected = NDef("", "", {"^ConstantFoldingCtrl/switch_0"});
+  CompareNodeInputs(graph, &expected, a);
+
   NodeDef* identity = graph.GetNode("ConstantFoldingCtrl/switch_0");
   ASSERT_NE(identity, nullptr);
-  ASSERT_EQ(identity->input_size(), 1);
-  EXPECT_EQ(identity->input(0), "switch");
+  expected = NDef("", "", {"switch"});
+  CompareNodeInputs(graph, &expected, identity);
   EXPECT_EQ(identity->op(), "Identity");
   EXPECT_EQ(identity->device(), kDevice);
   ASSERT_TRUE(identity->attr().count("T"));
@@ -655,7 +921,7 @@ TEST(MutableGraphViewTest, AddControllingFaninSwitchWithNoExistingIdentity) {
 TEST(MutableGraphViewTest, AddControllingFaninSwitchWithExistingAddedIdentity) {
   GraphDef graph_def = test::function::GDef(
       {NDef("a", "NotImportant", {}, {}), NDef("switch", "Switch", {}, {}),
-       NDef("ConstantFoldingCtrl/switch_0", "Identity", {}, {})},
+       NDef("ConstantFoldingCtrl/switch_0", "Identity", {"switch"})},
       /*funcs=*/{});
 
   MutableGraphView graph(&graph_def);
@@ -666,8 +932,72 @@ TEST(MutableGraphViewTest, AddControllingFaninSwitchWithExistingAddedIdentity) {
   ASSERT_EQ(graph.graph()->node_size(), 3);
   NodeDef* a = graph.GetNode("a");
   ASSERT_NE(a, nullptr);
-  ASSERT_EQ(a->input_size(), 1);
-  EXPECT_EQ(a->input(0), "^ConstantFoldingCtrl/switch_0");
+  NodeDef expected = NDef("", "", {"^ConstantFoldingCtrl/switch_0"});
+  CompareNodeInputs(graph, &expected, a);
+}
+
+TEST(MutableGraphViewTest, RemoveControllingFaninMissing) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {
+          NDef("a", "NotImportant", {}, {}),
+          NDef("b", "NotImportant", {}, {}),
+          NDef("c", "NotImportant", {}, {}),
+          NDef("d", "NotImportant", {"^a", "^b"}),
+      },
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  EXPECT_FALSE(graph.RemoveControllingFanin("d", "c"));
+
+  ASSERT_EQ(graph.graph()->node_size(), 4);
+  NodeDef* d = graph.GetNode("d");
+  ASSERT_NE(d, nullptr);
+  NodeDef expected = NDef("", "", {"^a", "^b"});
+  CompareNodeInputs(graph, &expected, d);
+}
+
+TEST(MutableGraphViewTest, RemoveControllingFaninExisting) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {
+          NDef("a", "NotImportant", {}, {}),
+          NDef("b", "NotImportant", {}, {}),
+          NDef("c", "NotImportant", {}, {}),
+          NDef("d", "NotImportant", {"^a", "^b", "^c"}),
+      },
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  EXPECT_TRUE(graph.RemoveControllingFanin("d", "a"));
+  EXPECT_FALSE(graph.RemoveControllingFanin("d", "a"));
+
+  ASSERT_EQ(graph.graph()->node_size(), 4);
+  NodeDef* d = graph.GetNode("d");
+  ASSERT_NE(d, nullptr);
+  NodeDef expected = NDef("", "", {"^c", "^b"});
+  CompareNodeInputs(graph, &expected, d);
+}
+
+TEST(MutableGraphViewTest, RemoveControllingFaninOnRegularFanin) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a"}),
+       NDef("c", "NotImportant", {"a", "b", "^c"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  EXPECT_FALSE(graph.RemoveControllingFanin("c", "a"));
+  EXPECT_FALSE(graph.RemoveControllingFanin("c", "b"));
+
+  ASSERT_EQ(graph.graph()->node_size(), 3);
+  NodeDef* c = graph.GetNode("c");
+  ASSERT_NE(c, nullptr);
+  NodeDef expected = NDef("", "", {"a", "b", "^c"});
+  CompareNodeInputs(graph, &expected, c);
 }
 
 TEST(MutableGraphViewTest, DeleteNodes) {
@@ -694,9 +1024,8 @@ TEST(MutableGraphViewTest, DeleteNodes) {
   auto bar_fanouts = graph.GetFanouts(*bar, include_control_fanouts);
   auto other_fanouts = graph.GetFanouts(*other, include_control_fanouts);
 
-  EXPECT_EQ(bar_fanouts.size(), 2);
+  EXPECT_EQ(bar_fanouts.size(), 1);
   EXPECT_EQ(bar_fanouts.count(MutableGraphView::InputPort(foo_2, 1)), 1);
-  EXPECT_EQ(bar_fanouts.count(MutableGraphView::InputPort(foo_2, -1)), 1);
 
   EXPECT_EQ(other_fanouts.size(), 1);
   EXPECT_EQ(other_fanouts.count(MutableGraphView::InputPort(foo_2, 0)), 1);
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 79578cb3ce0733bcfce1a382414c20881879e3e3..7e29cee86ac1586070cc48390586720b326355be 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -151,6 +151,8 @@ cc_library(
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/utils:functions",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
@@ -252,12 +254,16 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_topology_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:symbolic_shapes",
         "//tensorflow/core/grappler/utils:topological_sort",
+        "//tensorflow/core/grappler/utils:traversal",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -409,6 +415,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_topology_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
@@ -524,10 +531,10 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/utils:colocation",
         "//tensorflow/core/grappler/utils:functions",
         "//tensorflow/core/grappler/utils:topological_sort",
-        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -606,16 +613,18 @@ cc_library(
         ":constant_folding",
         ":evaluation_utils",
         ":graph_optimizer",
-        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_topology_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:frame",
+        "//tensorflow/core/grappler/utils:traversal",
         "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
@@ -691,6 +700,7 @@ cc_library(
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/utils:symbolic_shapes",
         "//tensorflow/core/grappler/utils:topological_sort",
         "@com_google_absl//absl/container:flat_hash_set",
     ],
@@ -702,6 +712,7 @@ tf_cuda_cc_test(
     deps = [
         ":remapper",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -758,7 +769,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":graph_optimizer",
-        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 94a87c3ff2a6c598d2d0afac54eb5d4a6b92fafc..e85e0473b3a70d984dcf201978e29bb845152284 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -22,6 +22,8 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -31,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/graph_topology_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
@@ -38,6 +41,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/symbolic_shapes.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/grappler/utils/traversal.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/hash/hash.h"
@@ -3334,36 +3338,37 @@ bool ArithmeticOptimizer::CanDedup(const NodeDef& node) const {
 }
 
 void ArithmeticOptimizer::DedupComputations() {
-  bool stop = true;
-  SimpleGraphView graph_view;
-  if (!graph_view.Initialize(*optimized_graph_).ok()) {
-    LOG(WARNING) << "Failed to build SimpleGraphView.";
+  GraphTopologyView graph_view;
+  if (!graph_view.InitializeFromGraph(*optimized_graph_).ok()) {
+    LOG(WARNING) << "Failed to initialize GraphTopologyView.";
     return;
   }
-  std::set<int> duplicates;
+
+  const absl::flat_hash_set<string> ops_to_traverse = {
+      "Identity", "IdentityN", "Reshape", "ExpandDims",
+      "Enter",    "Switch",    "Merge"};
+
   // Populate feed_inplace_op;
-  std::unordered_set<const NodeDef*> feeds_inplace_op;
-  for (int i = 0; i < optimized_graph_->node_size(); ++i) {
-    const NodeDef& root = optimized_graph_->node(i);
-    if (feeds_inplace_op.find(&root) != feeds_inplace_op.end()) {
-      continue;
-    }
+  absl::flat_hash_set<const NodeDef*> feeds_inplace_op;
+
+  for (const NodeDef& root : optimized_graph_->node()) {
+    if (feeds_inplace_op.find(&root) != feeds_inplace_op.end()) continue;
 
     if (ModifiesInputsInPlace(root)) {
-      const std::unordered_set<string> op_types_to_traverse = {
-          root.op(),    "Identity", "IdentityN", "Reshape",
-          "ExpandDims", "Enter",    "Switch",    "Merge"};
+      const auto is_continue_traversal = [&](const NodeDef* node) -> bool {
+        return node->op() == root.op() || ops_to_traverse.count(node->op()) > 0;
+      };
 
-      graph_view.DepthFirstSearchWithCallback(
-          op_types_to_traverse, i,
-          [&](const NodeDef& node) {
-            feeds_inplace_op.insert(&node);
-            return false;
-          },
-          SimpleGraphView::kFollowInputs /* search through inputs */);
+      DfsTraversal(graph_view, {&root}, TraversalDirection::kFollowInputs,
+                   DfsPredicates::Advance(is_continue_traversal),
+                   DfsCallbacks::PreOrder([&](const NodeDef* node) {
+                     feeds_inplace_op.insert(node);
+                   }));
     }
   }
 
+  bool stop = true;
+  std::set<int> duplicates;
   do {
     stop = true;
     UniqueNodes nodes;
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 1f24d35284cbfb8d22a75e179c3e1998e03059ff..b0c3c5b5181be4b744128fb18ac288c122c59f2a 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -901,8 +901,8 @@ DataType GetDataTypeFromNodeOrProps(const NodeDef& node,
 
 // static
 Status ConstantFolding::CreateNodeDef(const string& name,
-                                      const TensorValue& tensor,
-                                      NodeDef* node) {
+                                      const TensorValue& tensor, NodeDef* node,
+                                      size_t original_size) {
   node->set_name(name);
   node->set_op("Const");
 
@@ -980,11 +980,12 @@ Status ConstantFolding::CreateNodeDef(const string& name,
   }
   node->mutable_attr()->insert({"value", attr_tensor});
 
-  if (encoded_size < 10 * 1024 * 1024) {
-    return Status::OK();
+  if (encoded_size > original_size && encoded_size >= 10 * 1024 * 1024) {
+    return errors::InvalidArgument(
+        strings::StrCat("Can't fold ", name, ", its size would be too large (",
+                        encoded_size, " >= ", 10 * 1024 * 1024, " bytes)"));
   }
-  return errors::InvalidArgument(
-      strings::StrCat("Can't fold ", name, ", its size would be too large"));
+  return Status::OK();
 }
 
 Status ConstantFolding::EvaluateNode(const NodeDef& node,
@@ -1010,6 +1011,7 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
     }
   });
 
+  size_t total_inputs_size = 0;
   for (const auto& input : node.input()) {
     const TensorId input_tensor = ParseTensorName(input);
     if (input_tensor.index() < 0) {
@@ -1027,6 +1029,7 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
     Tensor* value = new Tensor(raw_val.dtype(), raw_val.tensor_shape());
     CHECK(value->FromProto(raw_val));
     inputs.emplace_back(value);
+    total_inputs_size += value->TotalBytes();
   }
 
   TF_RETURN_IF_ERROR(EvaluateNode(node, inputs, &output_tensors));
@@ -1041,7 +1044,8 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
       node_name = strings::StrCat(node_name, "-", i);
     }
     if (output_tensors[i].tensor) {
-      Status s = CreateNodeDef(node_name, output_tensors[i], &outputs->at(i));
+      Status s = CreateNodeDef(node_name, output_tensors[i], &outputs->at(i),
+                               total_inputs_size);
       if (!s.ok()) {
         *result_too_large = true;
         return s;
@@ -1697,12 +1701,12 @@ Status ConstantFolding::SimplifyNode(bool use_shape_info, NodeDef* node,
     return Status::OK();
   }
 
-  if (ConstantPushDown(node)) {
+  if (ConstantPushDown(optimized_graph, node)) {
     graph_modified_ = true;
     return Status::OK();
   }
 
-  if (MulConvPushDown(node, *properties)) {
+  if (MulConvPushDown(optimized_graph, node, *properties)) {
     graph_modified_ = true;
     return Status::OK();
   }
@@ -2612,7 +2616,8 @@ bool ConstantFolding::ReduceDivToReciprocalMul(GraphDef* optimized_graph,
   return false;
 }
 
-bool ConstantFolding::ConstantPushDown(NodeDef* node) {
+bool ConstantFolding::ConstantPushDown(GraphDef* optimized_graph,
+                                       NodeDef* node) {
   // Consider the transformation
   //
   //                      +                +       = parent
@@ -2680,10 +2685,10 @@ bool ConstantFolding::ConstantPushDown(NodeDef* node) {
       // edge. We can replace such a control edge with a control edge from A
       // to C.
       CHECK(MaybeRemoveControlInput(op_child_node->name(), const_child_node,
-                                    graph_, node_map_.get()));
+                                    optimized_graph, node_map_.get()));
       string other_leaf_input = left_leaf_is_constant ? op_child_node->input(0)
                                                       : op_child_node->input(1);
-      MaybeAddControlInput(other_leaf_input, const_child_node, graph_,
+      MaybeAddControlInput(other_leaf_input, const_child_node, optimized_graph,
                            node_map_.get());
     }
 
@@ -2700,7 +2705,7 @@ bool ConstantFolding::ConstantPushDown(NodeDef* node) {
   return false;
 }
 
-bool ConstantFolding::MulConvPushDown(NodeDef* node,
+bool ConstantFolding::MulConvPushDown(GraphDef* optimized_graph, NodeDef* node,
                                       const GraphProperties& properties) {
   // Push down multiplication on ConvND.
   //                       *                  ConvND
@@ -2792,12 +2797,13 @@ bool ConstantFolding::MulConvPushDown(NodeDef* node,
     }
     // Make sure we don't introduce loops in the graph by removing control
     // dependencies from the conv2d node to c2.
-    NodeDef* conv_const_node =
-        conv_left_is_constant ? conv_left_child : conv_right_child;
-    if (MaybeRemoveControlInput(conv_node->name(), const_node, graph_,
+    string conv_const_input =
+        conv_left_is_constant ? conv_node->input(0) : conv_node->input(1);
+    if (MaybeRemoveControlInput(conv_node->name(), const_node, optimized_graph,
                                 node_map_.get())) {
       // Add a control dep from c1 to c2 to ensure c2 is in the right frame
-      *const_node->add_input() = AsControlDependency(*conv_const_node);
+      MaybeAddControlInput(conv_const_input, const_node, optimized_graph,
+                           node_map_.get());
     }
 
     conv_node->set_name(node->name());
@@ -2809,6 +2815,8 @@ bool ConstantFolding::MulConvPushDown(NodeDef* node,
       node_map_->UpdateInput(conv_node->name(), node->input(1), mul_new_name);
       conv_node->set_input(1, mul_new_name);
     }
+    NodeDef* conv_const_node =
+        conv_left_is_constant ? conv_left_child : conv_right_child;
     if (left_child_is_constant) {
       node->set_input(1, conv_const_node->name());
     } else {
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h b/tensorflow/core/grappler/optimizers/constant_folding.h
index 0b778882d7d4d89d83de5d6bd5a6f9c827cf5bf8..99200925cb351478bd188361c33b88634caffa26 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.h
+++ b/tensorflow/core/grappler/optimizers/constant_folding.h
@@ -35,8 +35,10 @@ const char kConstantFoldingCtrl[] = "ConstantFoldingCtrl";
 // Constant folding optimization for a graph.
 class ConstantFolding : public GraphOptimizer {
  public:
+  // The size limit will only be considered if the newly created node is greater
+  // than original_size (optional).
   static Status CreateNodeDef(const string& name, const TensorValue& tensor,
-                              NodeDef* node);
+                              NodeDef* node, size_t original_size = 0);
   static string AddControlDependency(const string& input_name, GraphDef* graph,
                                      NodeMap* node_map);
 
@@ -124,11 +126,12 @@ class ConstantFolding : public GraphOptimizer {
 
   // Pushes down constants on '+' and '*' operators if applicable. Returns true
   // the transformation applied successfully.
-  bool ConstantPushDown(NodeDef* node);
+  bool ConstantPushDown(GraphDef* optimized_graph, NodeDef* node);
 
   // Aggregate constants present around a conv operator. Returns true if the
   // transformation was applied successfully.
-  bool MulConvPushDown(NodeDef* node, const GraphProperties& properties);
+  bool MulConvPushDown(GraphDef* optimized_graph, NodeDef* node,
+                       const GraphProperties& properties);
 
   // Strength reduces floating point division by a constant Div(x, const) to
   // multiplication by the reciprocal Mul(x, Reciprocal(const)).
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 192f48272f9ed08b2b6424f3c8e33d1afafdb56d..d7cabf5a8b8ad6659937e868df7635292936d48c 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -1601,7 +1601,7 @@ TEST_F(ConstantFoldingTest, SplitRemoval) {
   AddNode("split_dim", "Const", {}, {}, &want);
   AddNode("s1", "Identity", {"in1", AsControlDependency("split_dim")}, {},
           &want);
-  AddNode("s2", "Split", {"in2", "split_dim"}, {}, &want);
+  AddNode("s2", "Split", {"split_dim", "in2"}, {}, &want);
   AddNode("out", "Add", {"s1", "s2"}, {}, &want);
 
   CompareGraphs(want, got);
diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD
index 7593023ff4d649c623db9be98ac52ef6b799219f..57b0df3cb2d4bf0e3590e3cc436f73cf914ad635 100644
--- a/tensorflow/core/grappler/optimizers/data/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/BUILD
@@ -3,6 +3,30 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_protos_all")
 
+cc_library(
+    name = "meta_optimizer",
+    srcs = ["meta_optimizer.cc"],
+    hdrs = [
+        "meta_optimizer.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "@com_google_absl//absl/container:flat_hash_map",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/optimizers:arithmetic_optimizer",
+        "//tensorflow/core/grappler/optimizers:model_pruner",
+        "//tensorflow/core/grappler/optimizers:shape_optimizer",
+        "//tensorflow/core/grappler/optimizers:dependency_optimizer",
+        "//tensorflow/core/grappler/optimizers:function_optimizer",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:ptr_util",
+    ] + tf_protos_all(),
+)
+
 cc_library(
     name = "filter_fusion",
     srcs = ["filter_fusion.cc"],
@@ -561,6 +585,7 @@ cc_library(
         ":map_fusion",
         ":map_parallelization",
         ":map_vectorization",
+        ":meta_optimizer",
         ":noop_elimination",
         ":shuffle_and_repeat_fusion",
     ],
diff --git a/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc b/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc
index 9d8b388a3a8bca1fb560e5acc94d50f3d82ed30d..202dfb5ac8f038f686e51c9a671f89eac5cba73b 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc
@@ -42,7 +42,7 @@ NodeDef MakeMapAndBatchNode(StringPiece name, StringPiece input_node_name,
                             StringPiece function_name) {
   return test::function::NDef(
       name, "ExperimentalMapAndBatchDataset",
-      {string(input_node_name), "", string(batch_size_node_name),
+      {string(input_node_name), string(batch_size_node_name),
        string(num_parallel_calls_node_name), string(drop_remainder_node_name)},
       {{"f", FunctionDefHelper::FunctionRef(string(function_name))},
        {"Targuments", {}},
@@ -68,7 +68,7 @@ NodeDef MakeParallelInterleaveNode(StringPiece name,
                                    StringPiece function_name, bool sloppy) {
   return test::function::NDef(
       name, "ParallelInterleaveDatasetV2",
-      {string(input_node_name), "", string(cycle_length_node_name),
+      {string(input_node_name), string(cycle_length_node_name),
        string(block_length_node_name), string(num_parallel_calls_node_name)},
       {
           {"f", FunctionDefHelper::FunctionRef(string(function_name))},
diff --git a/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc b/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc
index d428d04a66659cd3b961428e3762ea3ab81ad69e..426c1dca5bb2c112d47b440a672b5a720a994cdf 100644
--- a/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc
@@ -30,9 +30,9 @@ TEST(LatencyAllEdgesTest, AddLatenciesAfterTensorMapPrefetch) {
   using test::function::NDef;
   GrapplerItem item;
   NodeDef component_node =
-      NDef("component_nodes", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}});
+      NDef("component_node", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}});
   NodeDef from_tensor_node =
-      NDef("from_tensor_nodes", "TensorDataset", {"component_nodes"},
+      NDef("from_tensor_node", "TensorDataset", {"component_node"},
            {{"Toutput_types", {}}, {"output_shapes", {}}});
 
   NodeDef captured_input_node = NDef("captured_input_node", "Const", {},
diff --git a/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0fc0cf43c62f43760ec9e8de959a6b93c3cea8a7
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc
@@ -0,0 +1,125 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/meta_optimizer.h"
+
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/optimizers/dependency_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/function_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/model_pruner.h"
+#include "tensorflow/core/grappler/optimizers/shape_optimizer.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace grappler {
+
+Status TFDataMetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+                                     GraphDef* output) {
+  // Stores the optimized item so far.
+  GrapplerItem optimized_item = item;
+
+  // Perform optimizations in a meaningful order.
+  for (const auto& optimization :
+       {"noop_elimination",
+        "shuffle_and_repeat_fusion",
+        "map_fusion",
+        "filter_fusion",
+        "map_and_filter_fusion",
+        "hoist_random_uniform",
+        "map_parallelization",
+        "map_and_batch_fusion",
+        "map_vectorization",
+        "make_numa_aware",
+        "latency_all_edges",
+        "make_sloppy",
+        "pruning",
+        "function",
+        "shape",
+        "arithmetic",
+        "dependency"}) {
+    TF_RETURN_IF_ERROR(
+        ApplyOptimization(optimization, cluster, &optimized_item));
+  }
+
+  // Store the final result of all the optimizations in `output`.
+  output->Swap(&optimized_item.graph);
+  return Status::OK();
+}
+
+Status TFDataMetaOptimizer::ApplyOptimization(const string& name,
+                                              Cluster* cluster,
+                                              GrapplerItem* item) const {
+  GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED();
+
+  const auto* optimizer = gtl::FindOrNull(enabled_optimizers_, name);
+  if (!optimizer) {
+    return Status::OK();
+  }
+
+  GraphDef result;
+  (*optimizer)->set_deadline_usec(this->deadline_usec());
+  TF_RETURN_IF_ERROR((*optimizer)->Optimize(cluster, *item, &result));
+  item->graph.Swap(&result);
+
+  return Status::OK();
+}
+
+Status TFDataMetaOptimizer::Init(
+    const tensorflow::RewriterConfig_CustomGraphOptimizer* config) {
+  if (!config) return Status::OK();
+
+  // Initialize custom tf.data optimizers based on config.
+  auto& optimizers = config->parameter_map().at("optimizers").list().s();
+  for (const auto& optimizer_name : optimizers) {
+    auto optimizer =
+        CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer_name);
+    if (optimizer) {
+      // None of our data optimizers implement a meaningful Init function.
+      // This returns an error in case any of them does.
+      TF_RETURN_IF_ERROR(optimizer->Init());
+      enabled_optimizers_[optimizer_name] = std::move(optimizer);
+    } else {
+      // This should never happen.
+      return errors::Internal(
+          "Tried to register a dataset optimizer that doesn't exist: ",
+          optimizer_name);
+    }
+  }
+
+  // Initialize standard grappler optimizers.
+  enabled_optimizers_["pruning"] = MakeUnique<ModelPruner>();
+  enabled_optimizers_["function"] =
+      MakeUnique<FunctionOptimizer>(RewriterConfig::ON);
+  enabled_optimizers_["shape"] = MakeUnique<ShapeOptimizer>();
+  enabled_optimizers_["arithmetic"] = MakeUnique<ArithmeticOptimizer>();
+  enabled_optimizers_["dependency"] = MakeUnique<DependencyOptimizer>();
+
+  return Status::OK();
+}
+
+void TFDataMetaOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item,
+                                   const GraphDef& optimize_output,
+                                   double result) {
+  // no-op
+}
+
+REGISTER_GRAPH_OPTIMIZER_AS(TFDataMetaOptimizer, "tf_data_meta_optimizer");
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/meta_optimizer.h b/tensorflow/core/grappler/optimizers/data/meta_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..c39ddda4cb4512bda923723d06811afaf1f3869a
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/meta_optimizer.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_META_OPTIMIZER_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_META_OPTIMIZER_H_
+
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// This optimizer performs tf.data-specific optimizations by invoking
+// other optimizers.
+class TFDataMetaOptimizer : public CustomGraphOptimizer {
+ public:
+  TFDataMetaOptimizer() = default;
+  ~TFDataMetaOptimizer() override = default;
+
+  string name() const override { return "tf_data_meta_optimizer"; };
+
+  Status Init(
+      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override;
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* output) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimize_output, double result) override;
+
+ private:
+  absl::flat_hash_map<string, std::unique_ptr<GraphOptimizer>>
+      enabled_optimizers_;
+
+  // Applies an optimization with the specified name on `item`, and stores
+  // the result in `item.graph`
+  Status ApplyOptimization(const string& name, Cluster* cluster,
+                           GrapplerItem* item) const;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_META_OPTIMIZER_H_
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index 7fee3ae9d51bcdb234945a6000985fb5531000a0..8b81cb2430ca9a34926217312f2894cf283c1dd2 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -205,14 +205,6 @@ bool DependencyOptimizer::BypassingNodeIsBeneficial(
     num_cross_out += static_cast<int>(output_node->device() != node_dev);
   }
 
-  if ((is_identity || is_multi_input_identity_n) && num_cross_in > 0 &&
-      num_cross_out > 0) {
-    // This identity node follows a device crossing, so it might be
-    // following a _Recv node after partioning. Do not remove such nodes,
-    // unless they only have consumers on the same device as themselves.
-    return false;
-  }
-
   // Make sure we do not increase the number of device crossings.
   const int num_cross_before = num_cross_in + num_cross_out;
   int num_cross_after = 0;
@@ -225,6 +217,15 @@ bool DependencyOptimizer::BypassingNodeIsBeneficial(
   if (num_cross_after > num_cross_before) {
     return false;
   }
+
+  if ((is_identity || is_multi_input_identity_n) && num_cross_in > 0 &&
+      num_cross_out > 0 && num_cross_after > 0) {
+    // This identity node follows a device crossing, so it might be
+    // following a _Recv node after partioning. Do not remove such nodes,
+    // unless they only have consumers on the same device as themselves.
+    return false;
+  }
+
   return true;
 }
 
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
index 8d70d9d5c73690e87d84cf941c749948e47ace26..5883fcb92681f13c0f1d7f4d623b409274d6f962 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
@@ -356,6 +356,32 @@ TEST_F(DependencyOptimizerTest, RemoveIdentityOps_DeviceBoundaries) {
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
+TEST_F(DependencyOptimizerTest, RemoveIdentityOps_IdenticalDevices) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::RandomUniform(s.WithOpName("x").WithDevice("/CPU:0"), {1, 2},
+                                DT_FLOAT);
+  auto id_a = ops::Identity(s.WithOpName("id_a").WithDevice("/CPU:1"), x);
+  Output id =
+      ops::Identity(s.WithControlDependencies(id_a).WithDevice("/CPU:0"), id_a);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch.push_back("Identity");
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(item.graph.node_size() - 1, output.node_size());
+  for (const NodeDef& node : output.node()) {
+    EXPECT_NE(node.name(), "id_a");
+    if (node.name() == "Identity") {
+      EXPECT_EQ(node.input(0), "x");
+    }
+  }
+}
+
 TEST_F(DependencyOptimizerTest, RemoveNoOps_SingleInputOrOutput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT);
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index 73c950b3fce5039e3789873b2d0fc7c515be6bcd..e9c30fee25cf54c8aa956ea13fbecead87e6d25e 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -15,10 +15,11 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/function_optimizer.h"
 
-#include <unordered_map>
 #include <vector>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_replace.h"
 #include "absl/strings/substitute.h"
@@ -163,10 +164,10 @@ struct FunctionSpecializationSignature {
 
   string func_name;
   bool is_in_fetch_set;
-  gtl::FlatSet<OutputPort> active_outputs;
-  std::unordered_map<string, DataType> type_parameters;
-  std::unordered_map<string, AttrValue> body_parameters;
-  std::unordered_map<InputPort, string> const_inputs;
+  absl::flat_hash_set<OutputPort> active_outputs;
+  absl::flat_hash_map<string, DataType> type_parameters;
+  absl::flat_hash_map<string, AttrValue> body_parameters;
+  absl::flat_hash_map<InputPort, string> const_inputs;
 
   bool operator==(const FunctionSpecializationSignature& other) const {
     bool equals = func_name == other.func_name &&
@@ -189,48 +190,45 @@ struct FunctionSpecializationSignature {
     return true;
   }
 
-  // TODO(ezhulenev): Migrate to AbslHashValue.
-  // TODO(ezhulenev): Optimize performance by computing hashes of unordered
-  // values first, and then compute a hash of sorted hashes.
-  struct Hash {
-    uint64 operator()(FunctionSpecializationSignature const& s) const {
-      uint64 h = Hash64(s.func_name);
-      h = Hash64Combine(std::hash<bool>()(s.is_in_fetch_set), h);
-
-      // Use std::set/std::map for deterministic iteration order.
-
-      std::set<OutputPort> active_outputs(s.active_outputs.begin(),
-                                          s.active_outputs.end());
-      for (const auto& active_output : active_outputs) {
-        h = Hash64Combine(std::hash<int>()(active_output), h);
-      }
-
-      std::map<string, DataType> types(s.type_parameters.begin(),
-                                       s.type_parameters.end());
-      for (const auto& pair : types) {
-        AttrValue attr_value;
-        attr_value.set_type(pair.second);
-        h = Hash64Combine(Hash64(pair.first), h);
-        h = Hash64Combine(AttrValueHash(attr_value), h);
-      }
-
-      std::map<string, AttrValue> body(s.body_parameters.begin(),
-                                       s.body_parameters.end());
-      for (const auto& pair : body) {
-        h = Hash64Combine(Hash64(pair.first), h);
-        h = Hash64Combine(FastAttrValueHash(pair.second), h);
-      }
-
-      std::map<InputPort, string> inputs(s.const_inputs.begin(),
-                                         s.const_inputs.end());
-      for (const auto& pair : inputs) {
-        h = Hash64Combine(std::hash<int>()(pair.first), h);
-        h = Hash64Combine(Hash64(pair.second), h);
-      }
-
-      return h;
-    }
-  };
+  template <typename H>
+  friend H AbslHashValue(H h, const FunctionSpecializationSignature& s) {
+    H base = H::combine(std::move(h), s.func_name, s.is_in_fetch_set);
+
+    // First pre-compute hashes for all values in collections with
+    // non-deterministic iteration order.
+    std::vector<uint64> hashes;
+    hashes.reserve(s.active_outputs.size()         //
+                   + s.type_parameters.size() * 2  //
+                   + s.body_parameters.size() * 2  //
+                   + s.const_inputs.size() * 2);
+
+    absl::c_transform(s.active_outputs, std::back_inserter(hashes),
+                      hash<OutputPort>());
+
+    using TypeParam = std::pair<const string, DataType>;
+    absl::c_for_each(s.type_parameters, [&hashes](const TypeParam& type_param) {
+      AttrValue attr_value;
+      attr_value.set_type(type_param.second);
+      hashes.push_back(Hash64(type_param.first));
+      hashes.push_back(AttrValueHash(attr_value));
+    });
+
+    using BodyParam = std::pair<const string, AttrValue>;
+    absl::c_for_each(s.body_parameters, [&hashes](const BodyParam& body_param) {
+      hashes.push_back(Hash64(body_param.first));
+      hashes.push_back(FastAttrValueHash(body_param.second));
+    });
+
+    using ConstInput = std::pair<const InputPort, string>;
+    absl::c_for_each(s.const_inputs, [&hashes](const ConstInput& const_input) {
+      hashes.push_back(hash<InputPort>()(const_input.first));
+      hashes.push_back(Hash64(const_input.second));
+    });
+
+    // Combine all pre-computed hashes in a deterministic order.
+    absl::c_sort(hashes);
+    return H::combine_contiguous(std::move(base), hashes.data(), hashes.size());
+  }
 };
 
 struct FunctionSpecialization {
@@ -238,13 +236,13 @@ struct FunctionSpecialization {
   // True if the function caller node is in GrapplerItem fetch set.
   bool is_in_fetch_set;
   // Names of the tensors that were pushed down into the function body.
-  gtl::FlatSet<string> const_inputs;
+  absl::flat_hash_set<string> const_inputs;
   // Control dependencies of pushed down const inputs have to be attached to
   // function caller node.
-  gtl::FlatSet<string> control_deps;
+  absl::flat_hash_set<string> control_deps;
   // Output tensors (ports) that consumed by other nodes in the graph or in a
   // GrapplerItem fetch set.
-  gtl::FlatSet<int> active_outputs;
+  absl::flat_hash_set<int> active_outputs;
   // Mapping from original function output port to the output port of
   // specialized function. If function specialization changes the number of
   // function outputs it's required to update all node consumers.
@@ -285,12 +283,13 @@ class FunctionOptimizerContext {
     return flr_;
   }
 
-  const gtl::FlatMap<SafeTensorId, SafeTensorId, SafeTensorId::Hasher>&
+  const absl::flat_hash_map<SafeTensorId, SafeTensorId, SafeTensorId::Hasher>&
   tensor_mapping() const {
     return tensor_mapping_;
   }
 
-  const gtl::FlatMap<string, std::vector<string>>& control_overrides() const {
+  const absl::flat_hash_map<string, std::vector<string>>& control_overrides()
+      const {
     return control_overrides_;
   }
 
@@ -298,7 +297,9 @@ class FunctionOptimizerContext {
 
   const string& grappler_item_id() const { return grappler_item_id_; }
 
-  const gtl::FlatSet<string>& fetch_tensors() const { return fetch_tensors_; }
+  const absl::flat_hash_set<string>& fetch_tensors() const {
+    return fetch_tensors_;
+  }
 
   const DeviceSet* devices() const {
     // Create fake devices lazily only if we need a DeviceSet.
@@ -365,7 +366,7 @@ class FunctionOptimizerContext {
 
  private:
   void InitializeTrulyConstNodes(const GrapplerItem& item) {
-    gtl::FlatSet<string> feed_nodes;
+    absl::flat_hash_set<string> feed_nodes;
     for (const auto& feed : item.feed) {
       feed_nodes.insert(NodeName(feed.first));
     }
@@ -411,7 +412,7 @@ class FunctionOptimizerContext {
   FunctionLibraryRuntime* flr_ = nullptr;
 
   // Fully defined names of the devices available to the GrapplerItem.
-  const gtl::FlatSet<string> available_device_names_;
+  const absl::flat_hash_set<string> available_device_names_;
 
   // List of available `FakedDevices` (lazily initialized, see devices()).
   mutable std::vector<std::unique_ptr<Device>> available_devices_;
@@ -421,16 +422,15 @@ class FunctionOptimizerContext {
   mutable DeviceSet available_device_set_;
 
   // Nodes that are Const and not in feed.
-  std::unordered_map<string, const NodeDef*> truly_const_nodes_;
+  absl::flat_hash_map<string, const NodeDef*> truly_const_nodes_;
   // Specialized functions.
-  std::unordered_map<FunctionSpecializationSignature,
-                     const FunctionSpecialization,
-                     FunctionSpecializationSignature::Hash>
+  absl::flat_hash_map<FunctionSpecializationSignature,
+                      const FunctionSpecialization>
       specialized_functions_;
 
   // GrapplerItem.fetch is a vector of tensors.
-  gtl::FlatSet<string> fetch_tensors_;  // format: node_name:port
-  gtl::FlatSet<string> fetch_nodes_;    // format: node_name
+  absl::flat_hash_set<string> fetch_tensors_;  // format: node_name:port
+  absl::flat_hash_set<string> fetch_nodes_;    // format: node_name
 
   // After function inlining and specialization, the optimized graph might be in
   // invalid state, nodes can read from non-existing function call nodes that
@@ -439,7 +439,7 @@ class FunctionOptimizerContext {
   //
   // Tensor mapping that has to be applied to the graph after all functions
   // optimizations (invalidated tensor id -> optimized graph tensor id).
-  gtl::FlatMap<SafeTensorId, SafeTensorId, SafeTensorId::Hasher>
+  absl::flat_hash_map<SafeTensorId, SafeTensorId, SafeTensorId::Hasher>
       tensor_mapping_;
 
   // When we inline a function into the optimized graph, we no longer have the
@@ -448,7 +448,7 @@ class FunctionOptimizerContext {
   // to all side-effectful ops inside the function body.
   //
   // Invalidated function call node name -> Inlined side-effectful nodes
-  gtl::FlatMap<string, std::vector<string>> control_overrides_;
+  absl::flat_hash_map<string, std::vector<string>> control_overrides_;
 
   // Use graph view to find active outputs of the function caller nodes.
   GraphView graph_view_;
@@ -472,10 +472,10 @@ const FunctionDef* FindFunctionCall(const FunctionOptimizerContext& ctx,
   return ctx.function_library().Find(node.op());
 }
 
-gtl::FlatSet<int> GetActiveOutputs(const NodeDef& node,
-                                   const FunctionOptimizerContext& ctx,
-                                   int size_hint = 0) {
-  gtl::FlatSet<int> active_outputs;
+absl::flat_hash_set<int> GetActiveOutputs(const NodeDef& node,
+                                          const FunctionOptimizerContext& ctx,
+                                          int size_hint = 0) {
+  absl::flat_hash_set<int> active_outputs;
   active_outputs.reserve(static_cast<size_t>(size_hint));
 
   // 1. Output can be consumed by the other graph node.
@@ -508,7 +508,7 @@ bool HasUnusedOutputs(const NodeDef& func_node, const FunctionDef& func,
   // number of output args is the same as number of possible function caller
   // node outputs.
   int num_outputs = func.signature().output_arg_size();
-  const gtl::FlatSet<int> active_outputs =
+  const absl::flat_hash_set<int> active_outputs =
       GetActiveOutputs(func_node, ctx, /*size_hind*/ num_outputs);
 
   return active_outputs.size() != num_outputs;
@@ -519,7 +519,7 @@ bool HasUnusedOutputs(const NodeDef& func_node, const FunctionDef& func,
 FunctionDefLibrary PruneFunctionLibrary(const FunctionLibraryDefinition& flib,
                                         const GraphDef& optimized_graph) {
   FunctionLibraryDefinition pruned_flib =
-      ReachableFunctionLibraryDefinition(flib, optimized_graph);
+      flib.ReachableDefinitions(optimized_graph);
 
   int pruned_functions = static_cast<int>(pruned_flib.num_functions()) -
                          static_cast<int>(flib.num_functions());
@@ -534,8 +534,8 @@ FunctionDefLibrary PruneFunctionLibrary(const FunctionLibraryDefinition& flib,
 Status PushDownConstInputs(const NodeDef& func_node,
                            const FunctionOptimizerContext& ctx,
                            GrapplerFunctionItem* item,
-                           gtl::FlatSet<string>* const_inputs,
-                           gtl::FlatSet<string>* control_deps) {
+                           absl::flat_hash_set<string>* const_inputs,
+                           absl::flat_hash_set<string>* control_deps) {
   // Record node control dependencies in the control_deps set.
   const auto record_control_deps = [&](const NodeDef* const_input) {
     for (int i = const_input->input_size() - 1; i >= 0; --i) {
@@ -585,7 +585,7 @@ void RemovePushedDownConstInputs(const FunctionSpecialization& specialization,
 
   // Attach control dependencies of pushed down const input to the caller node.
   if (!specialization.control_deps.empty()) {
-    gtl::FlatSet<string> existing_control_deps;
+    absl::flat_hash_set<string> existing_control_deps;
 
     for (const string& input : keep_inputs) {
       existing_control_deps.insert(AsControlDependency(NodeName(input)));
@@ -797,8 +797,8 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
 
   // Push const inputs into the function body, and keep track of their control
   // dependencies.
-  gtl::FlatSet<string> const_inputs;
-  gtl::FlatSet<string> control_deps;
+  absl::flat_hash_set<string> const_inputs;
+  absl::flat_hash_set<string> control_deps;
   TF_RETURN_IF_ERROR(PushDownConstInputs(func_node, *ctx, &item, &const_inputs,
                                          &control_deps));
 
@@ -806,8 +806,17 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
   // update outputs for the fetch nodes, so we just skip them.
   std::vector<std::pair<int, int>> output_mapping;
   if (!signature.is_in_fetch_set) {
-    TF_RETURN_IF_ERROR(
-        RemoveUnusedOutputs(signature.active_outputs, &item, &output_mapping));
+    int num_func_outputs = 0;
+    for (const auto& out_arg : item.outputs()) {
+      num_func_outputs += out_arg.output_nodes.size();
+    }
+
+    absl::flat_hash_set<int> remove;
+    for (int i = 0; i < num_func_outputs; ++i) {
+      if (!signature.active_outputs.count(i)) remove.insert(i);
+    }
+
+    TF_RETURN_IF_ERROR(RemoveFunctionOutputs(remove, &item, &output_mapping));
   }
 
   // TODO(ezhulenev): Push down known input shapes.
@@ -962,8 +971,10 @@ NodeDef InlinedFunctionInputsNode(const NodeDef& func_node,
 
 // Create an IdentityN node to hook the function outputs to: this ensures that
 // the function body is fully evaluated before its fanout gets scheduled.
-NodeDef InlinedFunctionOutputsNode(const NodeDef& func_node,
-                                   const GrapplerFunctionItem& item) {
+NodeDef InlinedFunctionOutputsNode(
+    const NodeDef& func_node, const GrapplerFunctionItem& item,
+    const absl::flat_hash_map<absl::string_view, absl::string_view>
+        output_tensors) {
   NodeDef outputs;
   outputs.set_name(func_node.name());
   outputs.set_op("IdentityN");
@@ -972,7 +983,8 @@ NodeDef InlinedFunctionOutputsNode(const NodeDef& func_node,
       (*outputs.mutable_attr())["T"].mutable_list();
 
   for (const OutputArgExpansion& output_arg : item.outputs()) {
-    for (const string& output_tensor : output_arg.output_tensors) {
+    for (const string& output_node : output_arg.output_nodes) {
+      const absl::string_view output_tensor = output_tensors.at(output_node);
       type_list->add_type(output_arg.data_type);
       outputs.add_input(strings::StrCat(func_node.name(), "/", output_tensor));
     }
@@ -1004,29 +1016,51 @@ Status InlineDirectFunctionCall(const NodeDef& func_node,
   }
 
   // Mapping from input placeholder name to function input position.
-  int idx = 0;
-  std::unordered_map<string, int> input_placeholders_idx;
+  absl::flat_hash_map<absl::string_view, int> input_placeholders_idx;
   for (const InputArgExpansion& input_arg : item.inputs()) {
     for (const string& placeholder : input_arg.placeholders) {
-      input_placeholders_idx[placeholder] = idx++;
+      const int idx = input_placeholders_idx.size();
+      input_placeholders_idx[placeholder] = idx;
     }
   }
 
+  // Bypass identity nodes added to the graph in place of function outputs.
+  absl::flat_hash_set<absl::string_view> output_nodes;
+  for (const OutputArgExpansion& output_arg : item.outputs()) {
+    for (const string& output_node : output_arg.output_nodes) {
+      output_nodes.insert(output_node);
+    }
+  }
+
+  // For each function output value we added an identity node that reads the
+  // tensor from one of the function body nodes. When we inline function into
+  // the main graph we want to bypass these nodes, so we keep a mapping from
+  // 'output node name' -> 'output tensor name'.
+  absl::flat_hash_map<absl::string_view, absl::string_view> output_tensors;
+
   // Hook inlined function inputs to IdentityN node.
   NodeDef* func_inputs = optimized_graph->add_node();
   *func_inputs = InlinedFunctionInputsNode(func_node, item);
 
   for (NodeDef& func_body_node : *item.mutable_function_body().mutable_node()) {
-    if (item.IsInputPlaceholder(func_body_node.name())) {
-      // Turn input placeholders into identity nodes.
+    const string& node_name = func_body_node.name();
+
+    // Skip output identity node, and update a mapping to the output tensor.
+    if (IsIdentity(func_body_node) && output_nodes.count(node_name)) {
+      output_tensors.emplace(node_name, func_body_node.input(0));
+      continue;
+    }
+
+    // Turn placeholders added in place of input arguments into identity nodes.
+    const auto input_placeholder_idx = input_placeholders_idx.find(node_name);
+    if (input_placeholder_idx != input_placeholders_idx.end()) {
       CHECK_EQ(0, func_body_node.input_size());
       func_body_node.set_op("Identity");
       (*func_body_node.mutable_attr())["T"] = func_body_node.attr().at("dtype");
       func_body_node.mutable_attr()->erase("dtype");
       func_body_node.mutable_attr()->erase("shape");
-      int input_idx = input_placeholders_idx[func_body_node.name()];
-      func_body_node.add_input(
-          strings::StrCat(func_inputs->name(), ":", input_idx));
+      func_body_node.add_input(strings::StrCat(func_inputs->name(), ":",
+                                               input_placeholder_idx->second));
     } else {
       // Update the input names if any.
       for (string& input : *func_body_node.mutable_input()) {
@@ -1082,9 +1116,12 @@ Status InlineDirectFunctionCall(const NodeDef& func_node,
     }
   }
 
+  DCHECK(output_tensors.size() == item.output_size())
+      << "Each function output must be mapped to an output tensor";
+
   // Hook inlined function outputs to IdentityN node.
   NodeDef* func_outputs = optimized_graph->add_node();
-  *func_outputs = InlinedFunctionOutputsNode(func_node, item);
+  *func_outputs = InlinedFunctionOutputsNode(func_node, item, output_tensors);
 
   return Status::OK();
 }
@@ -1363,11 +1400,11 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
   }
 
   // Mapping from input placeholder name to function input position.
-  int idx = 0;
   absl::flat_hash_map<absl::string_view, int> input_placeholders_idx;
   for (const InputArgExpansion& input_arg : item.inputs()) {
     for (const string& placeholder : input_arg.placeholders) {
-      input_placeholders_idx[placeholder] = idx++;
+      const int idx = input_placeholders_idx.size();
+      input_placeholders_idx[placeholder] = idx;
     }
   }
 
@@ -1378,8 +1415,11 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
   // same device as their corresponding input nodes.
 
   for (NodeDef& func_body_node : *item.graph.mutable_node()) {
-    if (item.IsInputPlaceholder(func_body_node.name())) {
-      const int input_idx = input_placeholders_idx[func_body_node.name()];
+    const auto input_placeholder_idx =
+        input_placeholders_idx.find(func_body_node.name());
+
+    if (input_placeholder_idx != input_placeholders_idx.end()) {
+      const int input_idx = input_placeholder_idx->second;
       const GraphView::OutputPort output_port =
           ctx->graph_view().GetRegularFanin({&func_node, input_idx});
 
@@ -1441,16 +1481,23 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
   // optimized graph: turn placeholders into identities, update nodes
   // connectivity, etc...
 
+  const auto inlined_node_name = [&func_node](const string& name) -> string {
+    return AddPrefixToNodeName(name, /*prefix=*/func_node.name());
+  };
+
   for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
-    if (item.IsInputPlaceholder(func_body_node.name())) {
-      // Turn input placeholders into identity node.
+    const string& node_name = func_body_node.name();
+
+    // Turn placeholders added in place of input arguments into identity nodes.
+    const auto input_placeholder_idx = input_placeholders_idx.find(node_name);
+    if (input_placeholder_idx != input_placeholders_idx.end()) {
       DCHECK_EQ(0, func_body_node.input_size());
       func_body_node.set_op("Identity");
       (*func_body_node.mutable_attr())["T"] = func_body_node.attr().at("dtype");
       func_body_node.mutable_attr()->erase("dtype");
       func_body_node.mutable_attr()->erase("shape");
-      const int input_idx = input_placeholders_idx[func_body_node.name()];
-      func_body_node.add_input(strings::StrCat(inputs[input_idx].ToString()));
+      const int input_idx = input_placeholder_idx->second;
+      func_body_node.add_input(inputs[input_idx].ToString());
 
       // All side effects must happen before inputs can start executing.
       for (const string& hb_node : happens_before) {
@@ -1460,7 +1507,7 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
     } else {
       // Update inputs of the regular function body nodes.
       for (string& input : *func_body_node.mutable_input()) {
-        input = AddPrefixToNodeName(input, /*prefix=*/func_node.name());
+        input = inlined_node_name(input);
       }
       if (func_body_node.input_size() == 0 && !empty_inputs_hook.empty()) {
         *func_body_node.add_input() = empty_inputs_hook[0];
@@ -1494,7 +1541,7 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
   for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
     if (!IsFreeOfSideEffect(func_body_node, &ctx->function_library())) {
       int num_fanouts = placed_graph_view.NumFanouts(
-          func_body_node, /*include_controlling_nodes=*/true);
+          func_body_node, /*include_controlled_nodes=*/true);
 
       // If the node doesn't have any outgoing edges and we do not have any
       // nodes in the `happens_after` set, we can't inline a function and
@@ -1514,11 +1561,36 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
     }
   }
 
+  // Identity nodes added to the function body in place of function outputs.
+  absl::flat_hash_set<string> output_nodes;
+  for (const OutputArgExpansion& output_arg : item.outputs()) {
+    for (const string& output_node : output_arg.output_nodes) {
+      output_nodes.insert(inlined_node_name(output_node));
+    }
+  }
+
+  // For each function output value we added an identity node that reads the
+  // tensor from one of the function body nodes. When we inline function into
+  // the main graph we want to bypass these nodes, so we keep a mapping from
+  // 'output node name' -> 'output tensor name'.
+  absl::flat_hash_map<string, string> output_tensors;
+
   // Move all the nodes to the optimized graph after successful preprocessing.
   for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
+    const string& node_name = func_body_node.name();
+
+    // Skip output identity node, and add a mapping to the output tensor.
+    if (IsIdentity(func_body_node) && output_nodes.count(node_name)) {
+      output_tensors.emplace(node_name, func_body_node.input(0));
+      continue;
+    }
+
     optimized_graph->add_node()->Swap(&func_body_node);
   }
 
+  DCHECK(output_tensors.size() == item.output_size())
+      << "Each function output must be mapped to an output tensor";
+
   // TODO(ezhulenev): Inline nested indirect function calls.
 
   // Indirect function call is fully inlined into the optimized graph, and we do
@@ -1526,10 +1598,13 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
   // mapping from old output tensors, to the outputs of inlined nodes.
   int output_idx = 0;
   for (const OutputArgExpansion& output : item.outputs()) {
-    for (const string& output_tensor : output.output_tensors) {
+    for (const string& output_node : output.output_nodes) {
+      const string inlined_output = inlined_node_name(output_node);
+      const string& output_tensor = output_tensors.at(inlined_output);
+
       const SafeTensorId from_tensor(func_node.name(), output_idx++);
-      const SafeTensorId to_tensor = ParseTensorName(
-          AddPrefixToNodeName(output_tensor, /*prefix=*/func_node.name()));
+      const SafeTensorId to_tensor = ParseTensorName(output_tensor);
+
       ctx->AddTensorMapping(from_tensor, to_tensor);
     }
   }
@@ -1699,7 +1774,7 @@ Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
   if (!ctx.control_overrides().empty()) {
     for (NodeDef& node : *optimized_graph->mutable_node()) {
       // Keep track of new control inputs to the node.
-      gtl::FlatSet<string> add_ctrl_inputs;
+      absl::flat_hash_set<string> add_ctrl_inputs;
 
       // Remove all invalidated control inputs.
       for (int idx = 0; idx < node.input_size(); /* see below */) {
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index 79da7dfa2d805d6dc0fc39b0f5cc312e636cc570..cebd002bed1487baf87e4244a0e38f51ed026561 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -660,7 +660,7 @@ TEST_F(FunctionOptimizerTest, InlineSymbolicGradient_IdentityFunc) {
   test::ExpectTensorEqual<float>(expected[0], optimized[0]);
 }
 
-TEST_F(FunctionOptimizerTest, InlineSymbolicGradient_NoInlineFunc) {
+TEST_F(FunctionOptimizerTest, InlineSymbolicGradientNoInlineFunc) {
   FunctionOptimizer optimizer(RewriterConfig::ON);
 
   FunctionDef func = FunctionDefHelper::Define(
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 8f25a1c8c1c48281fb44c01a142348863836d5aa..e9b706a58371cad72ef4b0652bc86364d7c4f5c0 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -503,6 +503,7 @@ class NodeProcessor : public GraphProcessor {
       UpdateAttrKSize();
       UpdateAttrStrides();
       UpdateAttrDilations();
+      UpdateAttrExplicitPaddings();
       UpdateAttrShape();
       TF_RETURN_IF_ERROR(AddLayoutTransposeToInputs());
       TF_RETURN_IF_ERROR(AddLayoutTransposeToOutputs());
@@ -753,6 +754,28 @@ class NodeProcessor : public GraphProcessor {
     }
   }
 
+  void UpdateAttrExplicitPaddings() {
+    if (node_->attr().find("explicit_paddings") != node_->attr().end()) {
+      auto list = node_->mutable_attr()->at("explicit_paddings").mutable_list();
+      int size = list->i_size();
+      if (size == 8) {
+        int64 height_before = list->i(2);
+        int64 height_after = list->i(3);
+        int64 width_before = list->i(4);
+        int64 width_after = list->i(5);
+        list->set_i(2, 0);
+        list->set_i(3, 0);
+        list->set_i(4, height_before);
+        list->set_i(5, height_after);
+        list->set_i(6, width_before);
+        list->set_i(7, width_after);
+      } else if (size != 0) {
+        LOG(ERROR) << "Cannot handle explicit_paddings attribute of size "
+                   << size;
+      }
+    }
+  }
+
   void UpdateAttrDataFormat() {
     if (node_->attr().find("data_format") != node_->attr().end()) {
       if (node_->attr().at("data_format").s().compare("NHWC") == 0) {
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
index 20e47c1b26b173c18eefd01ba7bdb87781a4c59b..eb2a8e87dde605d7a5867ca84f1c5260c42077e4 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/clusters/single_machine.h"
@@ -80,8 +81,13 @@ class LayoutOptimizerTest : public GrapplerTest {
     Output filter =
         ops::Const(s->WithOpName("Filter"), Input::Initializer(filter_data));
 
+    ops::Conv2D::Attrs attrs;
+    if (padding == "EXPLICIT") {
+      attrs = attrs.ExplicitPaddings({0, 0, 1, 2, 3, 4, 0, 0});
+    }
+
     Output conv = ops::Conv2D(s->WithOpName("Conv2D").WithDevice(device), input,
-                              filter, {1, stride, stride, 1}, padding);
+                              filter, {1, stride, stride, 1}, padding, attrs);
     return conv;
   }
 
@@ -100,6 +106,28 @@ class LayoutOptimizerTest : public GrapplerTest {
     int input_depth = 3;
     int filter_count = 2;
     int stride = 1;
+    int dilation = dilated ? 2 : 1;
+    int64 padding_top = 1;
+    int64 padding_bottom = 2;
+    int64 padding_left = 3;
+    int64 padding_right = 4;
+    int64 output_height;
+    int64 output_width;
+    Padding padding_enum;
+    if (padding == "SAME") {
+      padding_enum = SAME;
+    } else if (padding == "VALID") {
+      padding_enum = VALID;
+    } else {
+      CHECK_EQ(padding, "EXPLICIT");
+      padding_enum = EXPLICIT;
+    }
+    TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
+        input_height, filter_size, dilation, stride, padding_enum,
+        &output_height, &padding_top, &padding_bottom));
+    TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
+        input_width, filter_size, dilation, stride, padding_enum, &output_width,
+        &padding_left, &padding_right));
     TensorShape input_sizes_shape({4});
     Tensor input_data(DT_INT32, input_sizes_shape);
     test::FillValues<int>(&input_data,
@@ -112,8 +140,6 @@ class LayoutOptimizerTest : public GrapplerTest {
     Output filter =
         ops::Variable(s->WithOpName("Filter"), filter_shape, DT_FLOAT);
 
-    int output_height = input_height;
-    int output_width = input_width;
     TensorShape output_shape(
         {batch_size, output_height, output_width, filter_count});
     Tensor output_data(DT_FLOAT, output_shape);
@@ -124,10 +150,21 @@ class LayoutOptimizerTest : public GrapplerTest {
     Output conv_backprop_input;
     Output input_sizes_i =
         ops::Identity(s->WithOpName("InputSizesIdentity"), input_sizes);
-    ops::Conv2DBackpropInput::Attrs attrs;
-    if (dilated) {
-      attrs = attrs.Dilations({1, 2, 2, 1});
+    std::vector<int> dilations{1, dilation, dilation, 1};
+    std::vector<int> explicit_paddings;
+    if (padding == "EXPLICIT") {
+      explicit_paddings = {0,
+                           0,
+                           static_cast<int>(padding_top),
+                           static_cast<int>(padding_bottom),
+                           static_cast<int>(padding_left),
+                           static_cast<int>(padding_right),
+                           0,
+                           0};
     }
+    auto attrs =
+        ops::Conv2DBackpropInput::Attrs().Dilations(dilations).ExplicitPaddings(
+            explicit_paddings);
     if (const_input_size) {
       conv_backprop_input = ops::Conv2DBackpropInput(
           s->WithOpName("Conv2DBackpropInput"), input_sizes, filter, output,
@@ -186,7 +223,7 @@ class LayoutOptimizerTest : public GrapplerTest {
 
 TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv2DBackpropInput(&s, 7, 2, "SAME");
+  auto conv = SimpleConv2DBackpropInput(&s, 7, 2, "EXPLICIT");
   Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
@@ -306,6 +343,19 @@ TEST_F(LayoutOptimizerTest, NotEqualSizeWithValidPadding) {
   EXPECT_TRUE(node_map.GetNode("Conv2D-0-TransposeNHWCToNCHW-LayoutOptimizer"));
 }
 
+TEST_F(LayoutOptimizerTest, ExplicitPadding) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "EXPLICIT");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  EXPECT_TRUE(node_map.GetNode("Conv2D-0-TransposeNHWCToNCHW-LayoutOptimizer"));
+}
+
 TEST_F(LayoutOptimizerTest, Pad) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   auto conv = SimpleConv2D(&s, 4, 2, "VALID");
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
index 36064738408c744db53cb9e95645d6a2968b1746..cf5e4db29f418ac560c6a4c6381d4a7f3d88088e 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -30,12 +30,14 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/grappler/graph_topology_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/optimizers/evaluation_utils.h"
 #include "tensorflow/core/grappler/utils/frame.h"
+#include "tensorflow/core/grappler/utils/traversal.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -451,16 +453,29 @@ Status LoopInvariantNodeMotionOptimizer::Optimize() {
 }
 
 std::vector<int> GetStackPushNodesToConvert(
-    const SimpleGraphView& graph_view,
+    const GraphTopologyView& graph_view,
     const std::unordered_set<string>& nodes_to_preserve, int stack_node_idx) {
   VLOG(1) << "Stack node: " << graph_view.graph()->node(stack_node_idx).name();
+
   const std::unordered_set<string> op_types_to_traverse(
       {"Stack", "StackV2", "Enter", "RefEnter", "Switch", "RefSwitch",
        "Identity", "RefIdentity"});
+  const auto is_op_to_traverse = [&](const NodeDef* node) -> bool {
+    return op_types_to_traverse.find(node->op()) != op_types_to_traverse.end();
+  };
+
   std::vector<int> nodes_to_convert;
-  std::set<int> fanout;
-  graph_view.DepthFirstSearch(op_types_to_traverse, stack_node_idx, &fanout);
-  for (int fanout_idx : fanout) {
+  std::vector<int> fanouts;
+
+  DfsTraversal(graph_view, {graph_view.GetNode(stack_node_idx)},
+               TraversalDirection::kFollowOutputs,
+               DfsPredicates::Advance(is_op_to_traverse),
+               DfsCallbacks::PreOrder([&](const NodeDef* node) {
+                 const absl::optional<int> idx = graph_view.GetNodeIndex(*node);
+                 fanouts.push_back(idx.value());
+               }));
+
+  for (int fanout_idx : fanouts) {
     const NodeDef& fanout_node = graph_view.graph()->node(fanout_idx);
     VLOG(1) << "Fanout " << fanout_idx << " : " << fanout_node.name();
     if (IsStackPushOp(fanout_node)) {
@@ -468,13 +483,12 @@ std::vector<int> GetStackPushNodesToConvert(
       // happen when the graph we have contains only the forward pass for a loop
       // (as when the forward and backward passes are split across different
       // functions).
-      if (graph_view.has_node(fanout_node.input(0))) {
-        const NodeDef* stack_node =
-            &graph_view.node(graph_view.index(fanout_node.input(0)));
+      if (graph_view.HasNode(fanout_node.input(0))) {
+        const NodeDef* stack_node = graph_view.GetNode(fanout_node.input(0));
         while (stack_node->op() != "Stack" && stack_node->op() != "StackV2" &&
                stack_node->input_size() > 0 &&
-               graph_view.has_node(stack_node->input(0))) {
-          stack_node = &graph_view.node(graph_view.index(stack_node->input(0)));
+               graph_view.HasNode(stack_node->input(0))) {
+          stack_node = graph_view.GetNode(stack_node->input(0));
         }
         if (nodes_to_preserve.find(stack_node->name()) ==
             nodes_to_preserve.end()) {
@@ -488,7 +502,7 @@ std::vector<int> GetStackPushNodesToConvert(
                    op_types_to_traverse.end()) {
       continue;
     } else if (!IsStackPopOp(fanout_node) ||
-               (!graph_view.outputs(fanout_idx).empty() ||
+               (!graph_view.GetFanout(fanout_idx).empty() ||
                 nodes_to_preserve.find(fanout_node.name()) !=
                     nodes_to_preserve.end())) {
       // The node is either a stack pop with consumers or something unexpected
@@ -497,14 +511,16 @@ std::vector<int> GetStackPushNodesToConvert(
       break;
     }
   }
+
   return nodes_to_convert;
 }
 
 Status RemoveStackOps(const std::unordered_set<string>& nodes_to_preserve,
                       GraphDef* optimized_graph) {
   NodeMap node_map(optimized_graph);
-  SimpleGraphView graph_view;
-  TF_RETURN_IF_ERROR(graph_view.Initialize(*optimized_graph));
+  GraphTopologyView graph_view;
+  TF_RETURN_IF_ERROR(graph_view.InitializeFromGraph(*optimized_graph));
+
   for (int node_idx = 0; node_idx < optimized_graph->node_size(); ++node_idx) {
     if (IsStackOp(optimized_graph->node(node_idx))) {
       for (int push_node_idx : GetStackPushNodesToConvert(
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index 042e9fa32b12235f07113c576155bcdd01cf472e..b50d50f84245a5910ccf9cde5166465f4d9e9310 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_memory.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/costs/utils.h"
+#include "tensorflow/core/grappler/graph_topology_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
@@ -188,13 +189,14 @@ std::vector<RecomputedSubGraph> GetOpGroupsToRecompute(
       }
     }
     // Recompute only nodes which eventually feed into a target node.
-    connected_subgraph(node_map,
-                       true,   // Collect inputs
-                       false,  // Collect outputs
-                       [&unpruned_recompute_nodes](const NodeDef& node) {
-                         return unpruned_recompute_nodes.count(&node) != 0;
-                       },
-                       &current_recomputation.recomputed_source_nodes);
+    connected_subgraph(
+        node_map,
+        true,   // Collect inputs
+        false,  // Collect outputs
+        [&unpruned_recompute_nodes](const NodeDef& node) {
+          return unpruned_recompute_nodes.count(&node) != 0;
+        },
+        &current_recomputation.recomputed_source_nodes);
     if (current_recomputation.target_nodes.empty()) {
       continue;
     }
@@ -498,6 +500,16 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
   // Look for AddN nodes (and equivalent) and record input names.
   MutableGraphView view(&item->graph);
 
+  // It's ok to use immutable GraphTopologyView here, because we do not destroy
+  // any of the nodes in the underlying graph, we only add new nodes.
+  GraphTopologyView graph_topology;
+  Status initialized_topology = graph_topology.InitializeFromGraph(item->graph);
+  if (!initialized_topology.ok()) {
+    VLOG(1) << "Failed to initialize graph topology view: "
+            << initialized_topology.error_message();
+    return false;
+  }
+
   std::unordered_map<string, std::unordered_set<NodeDef*>> addn_list;
   for (NodeDef& node : *item->graph.mutable_node()) {
     if (!IsAddN(node) && node.op() != "AccumulateNV2") {
@@ -579,12 +591,11 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
 
     // Compute a topological ordering for the node fanin.
     std::unordered_map<const NodeDef*, int> topo_order;
-    ReverseDfs(view, {node}, nullptr,
-               [&topo_order](const NodeDef* n) {
-                 int topo_index = topo_order.size();
-                 topo_order[n] = topo_index;
-               },
-               nullptr);
+    DfsTraversal(graph_topology, {node}, TraversalDirection::kFollowInputs,
+                 DfsCallbacks::PostOrder([&topo_order](const NodeDef* n) {
+                   int topo_index = static_cast<int>(topo_order.size());
+                   topo_order[n] = topo_index;
+                 }));
 
     std::vector<int> input_topo_index;
 
@@ -1259,46 +1270,55 @@ Status RelaxAllocatorConstraints(GraphDef* optimized_graph) {
     return Status::OK();
   }
 
-  std::unordered_set<int> optimized_nodes;
-  SimpleGraphView graph_view;
-  TF_RETURN_IF_ERROR(graph_view.Initialize(*optimized_graph));
+  GraphTopologyView graph_view;
+  TF_RETURN_IF_ERROR(graph_view.InitializeFromGraph(*optimized_graph));
+  std::unordered_set<const NodeDef*> optimized_nodes;
+
   for (int i : assign_nodes) {
-    if (optimized_nodes.find(i) == optimized_nodes.end()) {
-      const NodeDef& assign_node = optimized_graph->node(i);
-      optimized_nodes.insert(i);
-      std::vector<int> assign_nodes_in_fanout;
-      assign_nodes_in_fanout.push_back(i);
-      std::set<int> transitive_fanout;
-      graph_view.DepthFirstSearch(std::unordered_set<string>{}, i,
-                                  &transitive_fanout);
+    const NodeDef& assign_node = optimized_graph->node(i);
+
+    if (optimized_nodes.find(&assign_node) == optimized_nodes.end()) {
+      std::vector<const NodeDef*> assign_nodes_in_fanout;
+      optimized_nodes.insert(&assign_node);
+      assign_nodes_in_fanout.push_back(&assign_node);
+
+      std::vector<const NodeDef*> transitive_fanout;
+      DfsTraversal(graph_view, {graph_view.GetNode(i)},
+                   TraversalDirection::kFollowOutputs,
+                   DfsCallbacks::PreOrder([&](const NodeDef* node) {
+                     transitive_fanout.push_back(node);
+                   }));
+
       bool relax_constraint = true;
       // If all nodes in the transitive fanout are on the same device as the
       // assign node, there is no need to allocate the output in pinned memory.
-      for (int fanout : transitive_fanout) {
-        const NodeDef& fanout_node = optimized_graph->node(fanout);
+      for (const NodeDef* fanout_node : transitive_fanout) {
+        // const NodeDef& fanout_node = optimized_graph->node(fanout);
         if (relax_constraint &&
-            (IsSend(fanout_node) ||
-             CrossesTaskOrCpuGpuBoundary(fanout_node, assign_node))) {
+            (IsSend(*fanout_node) ||
+             CrossesTaskOrCpuGpuBoundary(*fanout_node, assign_node))) {
           relax_constraint = false;
           break;
         }
-        if (optimized_nodes.find(fanout) == optimized_nodes.end() &&
-            IsAssign(fanout_node)) {
-          assign_nodes_in_fanout.push_back(fanout);
+        if (optimized_nodes.find(fanout_node) == optimized_nodes.end() &&
+            IsAssign(*fanout_node)) {
+          assign_nodes_in_fanout.push_back(fanout_node);
         }
       }
 
       if (relax_constraint) {
-        for (int assign_idx : assign_nodes_in_fanout) {
+        for (const NodeDef* assign_node_in_fanout : assign_nodes_in_fanout) {
           // If all devices match in fanout of node(i) then, by transitivity,
           // they must also match in the fanout of other assign nodes
           // in the fanout of node(i), so we can process them here,
           // and save computing their transitive fanout later.
-          optimized_nodes.insert(assign_idx);
+          optimized_nodes.insert(assign_node_in_fanout);
 
           // Set an attribute telling AssignOp to ignore allocator constraints.
+          const absl::optional<int> assign_node_idx =
+              graph_view.GetNodeIndex(*assign_node_in_fanout);
           NodeDef* assign_node_to_relax =
-              optimized_graph->mutable_node(assign_idx);
+              optimized_graph->mutable_node(assign_node_idx.value());
           (*assign_node_to_relax
                 ->mutable_attr())["_grappler_relax_allocator_constraints"]
               .set_b(true);
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 7b788c613c9c1c42e62f69bf2dab1122b08c4f9a..a84bb1d62f7d824631a1c25be23535f9d33d54dc 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/auto_parallel.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
@@ -425,6 +427,14 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   VLOG(1) << "Starting optimization for grappler item: " << item.id;
   optimization_results_.clear();
 
+  // Constructs a FunctionLibraryDefinition with functions that are reachable
+  // from the nodes of the graph.
+  const auto minimized_flib =
+      [](const GraphDef& graph) -> FunctionLibraryDefinition {
+    return FunctionLibraryDefinition(OpRegistry::Global(), graph.library())
+        .ReachableDefinitions(graph);
+  };
+
   // 0. Original graph might contain a huge function library, that is mostly
   // unused. This library copied over by each individual Grappler optimizer,
   // which adds a huge overhead. Before starting optimization passes we just
@@ -434,11 +444,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   GraphDef trimmed_graph;  // do not copy graph with a potentially huge library
   *trimmed_graph.mutable_node() = item.graph.node();
   *trimmed_graph.mutable_versions() = item.graph.versions();
-  *trimmed_graph.mutable_library() =
-      grappler::ReachableFunctionLibraryDefinition(
-          FunctionLibraryDefinition(OpRegistry::Global(), item.graph.library()),
-          item.graph)
-          .ToProto();
+  *trimmed_graph.mutable_library() = minimized_flib(item.graph).ToProto();
 
   GrapplerItem trimmed_item = item.WithGraph(std::move(trimmed_graph));
 
@@ -470,10 +476,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   }
 
   // 2. Optimize functions reachable from the optimized graph.
-  FunctionLibraryDefinition flib = ReachableFunctionLibraryDefinition(
-      FunctionLibraryDefinition(OpRegistry::Global(),
-                                optimized_graph->library()),
-      *optimized_graph);
+  FunctionLibraryDefinition flib = minimized_flib(*optimized_graph);
 
   // Find functions for which we might need to compute a gradient at runtime.
   absl::flat_hash_set<string> differentiable_functions;
@@ -626,5 +629,79 @@ Status RunMetaOptimizer(const GrapplerItem& item, const ConfigProto& cfg,
   return status;
 }
 
+Status OptimizeGraph(std::vector<string> ret_node_names,
+                     FunctionLibraryDefinition* flib,
+                     const DeviceSet& device_set, Device* cpu_device,
+                     const ConfigProto& config_proto,
+                     std::unique_ptr<tensorflow::Graph>* g) {
+  if (!tensorflow::grappler::MetaOptimizerEnabled(config_proto)) {
+    return Status::OK();
+  }
+
+  tensorflow::grappler::GrapplerItem item;
+
+  // Add all available devices so that inlined function can be placed.
+  for (const Device* d : device_set.devices()) {
+    Status added_device = item.AddDevice(d->name());
+    if (!added_device.ok()) VLOG(3) << added_device.error_message();
+  }
+
+  // Add fetches so that the graph can be pruned.
+  item.fetch.swap(ret_node_names);
+
+  (*g)->ToGraphDef(&item.graph);
+
+  if (flib) {
+    *item.graph.mutable_library() = flib->ToProto();
+  }
+
+  tensorflow::GraphDef out_graph;
+
+  tensorflow::grappler::VirtualCluster cluster(&device_set);
+
+  // TODO(nareshmodi): Consider adding and using the more generic GraphOptions
+  // proto (which also contain the OptimizerOptions).
+  TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
+      item, config_proto, cpu_device, &cluster, &out_graph));
+
+  std::unique_ptr<tensorflow::Graph> optimized_graph(
+      new tensorflow::Graph(OpRegistry::Global()));
+  TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(GraphConstructorOptions(),
+                                            out_graph, optimized_graph.get()));
+
+  // Copy optimized functions back to the overlay lib.
+  if (flib) {
+    for (const FunctionDef& fdef : out_graph.library().function()) {
+      const string& func_name = fdef.signature().name();
+      if (flib->Contains(func_name)) {
+        TF_RETURN_IF_ERROR(flib->ReplaceFunction(func_name, fdef));
+      } else {
+        TF_RETURN_IF_ERROR(flib->AddFunctionDef(fdef));
+      }
+    }
+  }
+
+  *g = std::move(optimized_graph);
+
+  // The graph conversion sets the requested device names but not the
+  // assigned device names. However, since at this point the graph is
+  // placed TF expects an assigned device name for every node. Therefore
+  // we copy the requested device into the assigned device field.
+  for (Node* node : (*g)->nodes()) {
+    if (node->IsOp() && node->assigned_device_name().empty()) {
+      if (node->requested_device().empty()) {
+        return errors::Internal(
+            "Either placer did not place the node or Grappler did not "
+            "copy the assigned device. Contact Grappler team since latter "
+            "is more likely. Node=",
+            node->name(), " Graph: ", (*g)->ToGraphDefDebug().DebugString());
+      }
+      node->set_assigned_device_name(node->requested_device());
+    }
+  }
+
+  return Status::OK();
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h
index a06da4394e4b8a4d8e75855a0a432114f7d7fcb3..c972fe3202bcbb6f0e2b29fd79f10cd894ec73de 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h
@@ -16,7 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_META_OPTIMIZER_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_META_OPTIMIZER_H_
 
+#include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -99,6 +102,27 @@ Status RunMetaOptimizer(const GrapplerItem& item, const ConfigProto& cfg,
                         DeviceBase* cpu_device, Cluster* cluster,
                         GraphDef* optimized_graph);
 
+// Wrapper around RunMetaOptimizer convenient for optimizing
+// function graphs.
+//
+// Runs grappler optimizations on `g` based on `config_proto`.
+// `ret_node_names`: a vector of node names whose outputs are returned,
+//    aka fetches. when `g` represent a function, these are _Retval nodes.
+// `lib`: function library to use with `g`.
+// `device_set`: the set of devices that graph can refer to.
+// `cpu_device`: the CPU device.
+// `config_proto`: Grapper configuration.
+//
+// **g is a graph constructed based on the runtime library 'lib'.
+// OptimizeGraph mutates **g extensively and replaces '*g' with a
+// complete copy. Therefore, the caller should not keep any references
+// to nodes *g.
+Status OptimizeGraph(std::vector<string> ret_node_names,
+                     FunctionLibraryDefinition* lib,
+                     const DeviceSet& device_set, Device* cpu_device,
+                     const ConfigProto& config_proto,
+                     std::unique_ptr<tensorflow::Graph>* g);
+
 }  // namespace grappler
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index 12db5d6ca9b001fa04e42e6d228fe6289d87726e..a061f6194a1ea85bcf998956b3abd90d0e322c9a 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -231,7 +231,7 @@ TEST_F(MetaOptimizerTest, RunToggleOptimizersAndCustomGraphOptimizerTwice) {
 TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
   using test::function::NDef;
 
-  // Enable ony function optimization.
+  // Enable only function optimization.
   ConfigProto config_proto;
   auto& rewriter_config =
       *config_proto.mutable_graph_options()->mutable_rewrite_options();
@@ -300,7 +300,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
                                            output.library());
 
   // Specialized and optimized functions should be added to the graph.
-  EXPECT_EQ(6, optimized_flib.num_functions());
+  EXPECT_EQ(5, optimized_flib.num_functions());
 
   // Get a specialized function name.
   const auto specialized_name = [](const string& fn, const string& node,
@@ -314,25 +314,22 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
       specialized_name("MyQuadratic", "quadratic", "tf_graph");
 
   // MySquare should be specialized and optimized for 3 instantiations:
-  //   1. 'square' node in the main graph
-  //   2. 'square' node in the MyQuadratic specialization (not in a fetch set)
-  //   3. 'quadratic' node in the MyQuadratic specialization (is in a fetch set)
+  //   1.  'square' node in the main graph
+  //   2.  'square' node in the MyQuadratic specialization
+  //   3*. 'quadratic' node in the MyQuadratic specialization
+  //        has identical instantiation context to #2
 
   const string optimized_1 = specialized_name("MySquare", "square", "tf_graph");
   const string optimized_2 =
       specialized_name("MySquare", "square", optimized_0);
-  const string optimized_3 =
-      specialized_name("MySquare", "quadratic", optimized_0);
 
   const FunctionDef* optimized_func_0 = optimized_flib.Find(optimized_0);
   const FunctionDef* optimized_func_1 = optimized_flib.Find(optimized_1);
   const FunctionDef* optimized_func_2 = optimized_flib.Find(optimized_2);
-  const FunctionDef* optimized_func_3 = optimized_flib.Find(optimized_3);
 
   ASSERT_NE(optimized_func_0, nullptr);
   ASSERT_NE(optimized_func_1, nullptr);
   ASSERT_NE(optimized_func_2, nullptr);
-  ASSERT_NE(optimized_func_3, nullptr);
 
   // Graph should call optimized function.
   int count = 0;
@@ -351,13 +348,13 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
     if (node.name() == "square" && ++count) {
       EXPECT_EQ(optimized_2, node.op());
     } else if (node.name() == "quadratic" && ++count) {
-      EXPECT_EQ(optimized_3, node.op());
+      EXPECT_EQ(optimized_2, node.op());
     }
   }
   EXPECT_EQ(2, count);
 
-  const std::vector<const FunctionDef*> optimized_funcs = {
-      optimized_func_1, optimized_func_2, optimized_func_3};
+  const std::vector<const FunctionDef*> optimized_funcs = {optimized_func_1,
+                                                           optimized_func_2};
 
   // MyMul should be inlined into all optimized versions of MySquare.
   for (const FunctionDef* optimized_func : optimized_funcs) {
@@ -403,6 +400,96 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
   test::ExpectTensorEqual<int>(tensors_expected[1], tensors[1]);
 }
 
+TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryPruneUnusedOutputs) {
+  using test::function::NDef;
+
+  ConfigProto config_proto;
+  MetaOptimizer optimizer(nullptr, config_proto);
+
+  // MyMul computes x*y three times and has three output values.
+  FunctionDef my_mul = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z0:T", "z1:T", "z2:T"}, {"T: {float, int32}"},
+      {{{"output0"}, "Mul", {"x", "y"}, {{"T", "$T"}}},
+       {{"output1"}, "Mul", {"x", "y"}, {{"T", "$T"}}},
+       {{"output2"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z0", "output0:z:0"}, {"z1", "output1:z:0"}, {"z2", "output2:z:0"}});
+
+  // Call MyMyl and forward all three outputs.
+  FunctionDef my_fwd = FunctionDefHelper::Create(
+      "Fwd", {"x:T", "y:T"}, {"z0:T", "z1:T", "z2:T"}, {"T: {float, int32}"},
+      {{{"output"}, "MyMul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z0", "output:z0:0"}, {"z1", "output:z1:0"}, {"z2", "output:z2:0"}});
+
+  // Mark both functions as `_noinline` to trigger specialization.
+  (*my_mul.mutable_attr())["_noinline"].set_b(true);
+  (*my_fwd.mutable_attr())["_noinline"].set_b(true);
+  std::vector<FunctionDef> function_library = {my_mul, my_fwd};
+
+  // Tensorflow graph:
+  //   a = Placeholder[T=float]
+  //   b = Placeholder[T=float]
+  //   fwd = Fwd(a, b)
+  //
+  // Fetch fwd:2 via Identity node.
+  GrapplerItem item;
+  item.id = "tf_graph";
+  item.fetch = {"ret"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("fwd", "Fwd", {"a", "b"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("ret", "Identity", {"fwd:2"}, {{"T", DT_FLOAT}}, kDevice)},
+      function_library);
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  FunctionLibraryDefinition optimized_flib(OpRegistry::Global(),
+                                           output.library());
+
+  // Specialized functions should be added to the graph.
+  EXPECT_EQ(3, optimized_flib.num_functions());
+
+  // Expected names of the specialized functions.
+  const string specialized_my_fwd = "Fwd_specialized_for_fwd_at_tf_graph";
+  const string specialized_my_mul =
+      absl::StrCat("MyMul_specialized_for_output_at_", specialized_my_fwd);
+
+  // Specialized MyMul should have just one output argument.
+  FunctionDef expected_my_mul = FunctionDefHelper::Create(
+      specialized_my_mul, {"x:float", "y:float"}, {"z2:float"}, {},
+      {{{"output2"}, "Mul", {"x", "y"}, {{"T", DT_FLOAT}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z2", "output2:z:0"}});
+
+  // Specialized Fwd should also have just one output argument.
+  FunctionDef expected_my_fwd = FunctionDefHelper::Create(
+      specialized_my_fwd, {"x:float", "y:float"}, {"z2:float"}, {},
+      {{{"output"}, specialized_my_mul, {"x", "y"}, {{"T", DT_FLOAT}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z2", "output:z2:0"}});
+
+  const FunctionDef* my_mul_spec = optimized_flib.Find(specialized_my_mul);
+  const FunctionDef* my_fwd_spec = optimized_flib.Find(specialized_my_fwd);
+
+  ASSERT_NE(my_mul_spec, nullptr);
+  ASSERT_NE(my_fwd_spec, nullptr);
+
+  CompareFunctions(expected_my_mul, *my_mul_spec);
+  CompareFunctions(expected_my_fwd, *my_fwd_spec);
+
+  item.feed.emplace_back("a", test::AsScalar<float>(2.0f));
+  item.feed.emplace_back("b", test::AsScalar<float>(4.0f));
+  auto tensors_expected = EvaluateFetchNodes(item);
+
+  GrapplerItem optimized = item.WithGraph(std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
 TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryPruneFunctionBody) {
   using test::function::NDef;
 
diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc
index f0c81f29e687aa792df74b69b4c063090a707e61..0869e3b49bf8f07cd4377299cdc303e07adfb03b 100644
--- a/tensorflow/core/grappler/optimizers/remapper.cc
+++ b/tensorflow/core/grappler/optimizers/remapper.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/symbolic_shapes.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -60,17 +61,30 @@ struct RemapperContext {
 
 // FusedBatchNorm that can be replaced with a cheaper set of primitives.
 struct FusedBatchNorm {
+  FusedBatchNorm() = default;
+  explicit FusedBatchNorm(const NodeDef* fused_batch_norm)
+      : fused_batch_norm(fused_batch_norm) {}
+
   const NodeDef* fused_batch_norm = nullptr;
 };
 
 // Conv2D node followed by a BiasAdd.
 struct Conv2DWithBiasAdd {
+  Conv2DWithBiasAdd() = default;
+  Conv2DWithBiasAdd(const NodeDef* conv2d, const NodeDef* bias_add)
+      : conv2d(conv2d), bias_add(bias_add) {}
+
   const NodeDef* conv2d = nullptr;
   const NodeDef* bias_add = nullptr;
 };
 
 // Conv2D node followed by a BiasAdd and Relu.
 struct Conv2DWithBiasAddAndRelu {
+  Conv2DWithBiasAddAndRelu() = default;
+  Conv2DWithBiasAddAndRelu(const NodeDef* conv2d, const NodeDef* bias_add,
+                           const NodeDef* relu)
+      : conv2d(conv2d), bias_add(bias_add), relu(relu) {}
+
   const NodeDef* conv2d = nullptr;
   const NodeDef* bias_add = nullptr;
   const NodeDef* relu = nullptr;
@@ -78,6 +92,11 @@ struct Conv2DWithBiasAddAndRelu {
 
 // Conv2D node followed by a Squeeze and BiasAdd.
 struct Conv2DWithSqueezeAndBiasAdd {
+  Conv2DWithSqueezeAndBiasAdd() = default;
+  Conv2DWithSqueezeAndBiasAdd(const NodeDef* conv2d, const NodeDef* squeeze,
+                              const NodeDef* bias_add)
+      : conv2d(conv2d), squeeze(squeeze), bias_add(bias_add) {}
+
   const NodeDef* conv2d = nullptr;
   const NodeDef* squeeze = nullptr;
   const NodeDef* bias_add = nullptr;
@@ -85,6 +104,11 @@ struct Conv2DWithSqueezeAndBiasAdd {
 
 // Conv2D node followed by a FusedBatchNorm.
 struct Conv2DWithBatchNorm {
+  Conv2DWithBatchNorm() = default;
+  Conv2DWithBatchNorm(const NodeDef* conv2d, const NodeDef* fused_batch_norm,
+                      float epsilon = 0.0)
+      : conv2d(conv2d), fused_batch_norm(fused_batch_norm), epsilon(epsilon) {}
+
   const NodeDef* conv2d = nullptr;
   const NodeDef* fused_batch_norm = nullptr;
   float epsilon = 0.0;
@@ -92,16 +116,23 @@ struct Conv2DWithBatchNorm {
 
 // Conv2D node followed by a FusedBatchNorm and Relu.
 struct Conv2DWithBatchNormAndRelu {
+  Conv2DWithBatchNormAndRelu() = default;
+  Conv2DWithBatchNormAndRelu(const NodeDef* conv2d,
+                             const NodeDef* fused_batch_norm,
+                             const NodeDef* relu, float epsilon = 0.0)
+      : conv2d(conv2d),
+        fused_batch_norm(fused_batch_norm),
+        relu(relu),
+        epsilon(epsilon) {}
+
   const NodeDef* conv2d = nullptr;
   const NodeDef* fused_batch_norm = nullptr;
   const NodeDef* relu = nullptr;
   float epsilon = 0.0;
 };
 
-bool IsFloatOrDoubleDataType(const NodeDef* node,
-                             const string& type_attr = "T") {
-  DataType dtype = GetDataTypeFromAttr(*node, type_attr);
-  return dtype == DT_FLOAT || dtype == DT_DOUBLE;
+bool IsInPreserveSet(const RemapperContext& ctx, const NodeDef* node) {
+  return ctx.nodes_to_preserve.count(node->name()) > 0;
 }
 
 bool HaveSameDataType(const NodeDef* lhs, const NodeDef* rhs,
@@ -119,91 +150,165 @@ bool HasDataType(const NodeDef* node, const DataType& expected,
   return dtype == expected;
 }
 
-bool IsInPreserveSet(const RemapperContext& ctx, const NodeDef* node) {
-  return ctx.nodes_to_preserve.count(node->name()) > 0;
+bool IsCpuCompatibleDataType(const NodeDef* node,
+                             const string& type_attr = "T") {
+  DataType dtype = GetDataTypeFromAttr(*node, type_attr);
+  return dtype == DT_FLOAT || dtype == DT_DOUBLE;
+}
+
+bool IsGpuCompatibleDataType(const NodeDef* node,
+                             const string& type_attr = "T") {
+  DataType dtype = GetDataTypeFromAttr(*node, type_attr);
+  return dtype == DT_FLOAT;
+}
+
+bool IsCpuCompatibleDataFormat(const NodeDef* conv2d) {
+  DCHECK(IsConv2D(*conv2d)) << "Expected Conv2D op";
+  const string& data_format = conv2d->attr().at(kDataFormat).s();
+  return data_format == "NHWC";
+}
+
+bool IsGpuCompatibleDataFormat(const NodeDef* conv2d) {
+  DCHECK(IsConv2D(*conv2d)) << "Expected Conv2D op";
+  const string& data_format = conv2d->attr().at(kDataFormat).s();
+  return data_format == "NHWC" || data_format == "NCHW";
 }
 
-bool FindConv2DWithBias(const RemapperContext& ctx, const NodeDef* node,
-                        Conv2DWithBiasAdd* matched) {
+bool IsCpuCompatibleConv2D(const NodeDef* conv2d) {
+  DCHECK(IsConv2D(*conv2d)) << "Expected Conv2D op";
+  return NodeIsOnCpu(conv2d) && IsCpuCompatibleDataType(conv2d) &&
+         IsCpuCompatibleDataFormat(conv2d);
+}
+
+bool IsGpuCompatibleConv2D(const NodeDef* conv2d) {
+  DCHECK(IsConv2D(*conv2d)) << "Expected Conv2D op";
+  return NodeIsOnGpu(conv2d) && IsGpuCompatibleDataType(conv2d) &&
+         IsGpuCompatibleDataFormat(conv2d);
+}
+
+// Checks if we can rewrite a pattern to the `_FusedConv2D` on CPU device.
+template <typename Pattern>
+bool IsCpuCompatible(const Pattern& matched) {
+  return IsCpuCompatibleConv2D(matched.conv2d);
+}
+
+// Checks if we can rewrite a pattern to the `_FusedConv2D` on GPU device.
+bool IsGpuCompatible(const RemapperContext& ctx,
+                     const Conv2DWithBiasAddAndRelu& matched) {
+  const std::vector<OpInfo::TensorProperties>& input_props =
+      ctx.graph_properties.GetInputProperties(matched.conv2d->name());
+  const TensorShapeProto& filter_shape =
+      input_props.size() >= 2 ? input_props[1].shape() : TensorShapeProto();
+
+  // FusedConv2D on GPU with 1x1 convolution is marginally faster than
+  // in-graph computation in micro benchmarks (see kernels/conv_ops_test.cc),
+  // and significantly slower in large scale benchmarks.
+  bool is_spatial_conv = Rank(filter_shape) == 4 &&          //
+                         IsKnown(filter_shape.dim(1)) &&     //
+                         IsKnown(filter_shape.dim(2)) &&     //
+                         filter_shape.dim(1).size() != 1 &&  //
+                         filter_shape.dim(2).size() != 1;
+
+  return is_spatial_conv && IsGpuCompatibleConv2D(matched.conv2d);
+}
+bool IsGpuCompatible(const RemapperContext& ctx,
+                     const Conv2DWithBiasAdd& matched) {
+  return false;
+}
+bool IsGpuCompatible(const RemapperContext& ctx,
+                     const Conv2DWithSqueezeAndBiasAdd& matched) {
+  return false;
+}
+
+// Returns true if the given pattern is supported on the assigned device.
+template <typename Pattern>
+bool IsDeviceCompatible(const RemapperContext& ctx, Pattern& matched) {
+  return IsCpuCompatible(matched) || IsGpuCompatible(ctx, matched);
+}
+
+bool FindConv2DWithBias(const RemapperContext& ctx, const NodeDef* bias_add,
+                        Conv2DWithBiasAdd* matched,
+                        bool check_device_compatible = true) {
   if (!EigenSupportsContractionOutputKernel()) return false;
 
   // Root of the pattern must be a BiasAdd.
-  if (!node) return false;
-  if (!IsBiasAdd(*node)) return false;
-  if (!NodeIsOnCpu(node)) return false;
-  if (!IsFloatOrDoubleDataType(node)) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+  if (bias_add == nullptr || !IsBiasAdd(*bias_add) ||
+      HasControlFaninOrFanout(ctx.graph_view, bias_add))
+    return false;
 
-  // Input to the BiasAdd must be a Conv2D in NHWC format.
-  const auto input_port = GraphView::InputPort(node, 0);
+  // Input to the BiasAdd must be a Conv2D.
+  const auto input_port = GraphView::InputPort(bias_add, 0);
   const auto conv2d = ctx.graph_view.GetRegularFanin(input_port);
-  if (!conv2d.node) return false;
-  if (!IsConv2D(*conv2d.node)) return false;
-  if (conv2d.node->attr().at(kDataFormat).s() != "NHWC") return false;
-  if (!NodeIsOnCpu(conv2d.node)) return false;
-  if (!HaveSameDataType(node, conv2d.node)) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, conv2d.node)) return false;
-  if (!HasSingleFanoutNode(ctx.graph_view, conv2d.node)) return false;
-  if (IsInPreserveSet(ctx, conv2d.node)) return false;
+
+  if (!conv2d.node || !IsConv2D(*conv2d.node) ||
+      !HaveSameDataType(bias_add, conv2d.node) ||
+      HasControlFaninOrFanout(ctx.graph_view, conv2d.node) ||
+      !HasSingleFanoutNode(ctx.graph_view, conv2d.node) ||
+      IsInPreserveSet(ctx, conv2d.node))
+    return false;
+
+  // Check that data type and data format are supported on assigned device.
+  const Conv2DWithBiasAdd pattern{conv2d.node, bias_add};
+  if (check_device_compatible && !IsDeviceCompatible(ctx, pattern)) {
+    return false;
+  }
 
   // We successfully found a Conv2D+BiasAdd pattern.
-  matched->conv2d = conv2d.node;
-  matched->bias_add = node;
+  *matched = pattern;
 
   return true;
 }
 
-bool FindConv2DWithBiasAndRelu(const RemapperContext& ctx, const NodeDef* node,
+bool FindConv2DWithBiasAndRelu(const RemapperContext& ctx, const NodeDef* relu,
                                Conv2DWithBiasAddAndRelu* matched) {
   if (!EigenSupportsContractionOutputKernel()) return false;
 
   // Root of the pattern must be a Relu.
-  if (!node) return false;
-  if (!IsRelu(*node)) return false;
-  if (!NodeIsOnCpu(node)) return false;
-  if (!IsFloatOrDoubleDataType(node)) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+  if (!relu || !IsRelu(*relu) || HasControlFaninOrFanout(ctx.graph_view, relu))
+    return false;
 
   // And input to Relu must match Conv2DWithBiasAdd pattern.
-  const auto input_port = GraphView::InputPort(node, 0);
+  const auto input_port = GraphView::InputPort(relu, 0);
   const auto bias_add = ctx.graph_view.GetRegularFanin(input_port);
 
   Conv2DWithBiasAdd base;
-  if (!FindConv2DWithBias(ctx, bias_add.node, &base)) return false;
-  if (!HasSingleFanoutNode(ctx.graph_view, base.bias_add)) return false;
-  if (!HaveSameDataType(node, base.bias_add)) return false;
-  if (IsInPreserveSet(ctx, base.bias_add)) return false;
+  if (!FindConv2DWithBias(ctx, bias_add.node, &base,
+                          /*check_device_compatible=*/false) ||
+      !HasSingleFanoutNode(ctx.graph_view, base.bias_add) ||
+      !HaveSameDataType(relu, base.bias_add) ||
+      IsInPreserveSet(ctx, base.bias_add))
+    return false;
+
+  // Check that data type and data format are supported on assigned device.
+  const Conv2DWithBiasAddAndRelu pattern{base.conv2d, base.bias_add, relu};
+  if (!IsDeviceCompatible(ctx, pattern)) return false;
 
   // We successfully found a Conv2D+BiasAdd+Relu pattern.
-  matched->conv2d = base.conv2d;
-  matched->bias_add = base.bias_add;
-  matched->relu = node;
+  *matched = pattern;
 
   return true;
 }
 
 bool FindConv2DWithSqueezeAndBias(const RemapperContext& ctx,
-                                  const NodeDef* node,
+                                  const NodeDef* bias_add,
                                   Conv2DWithSqueezeAndBiasAdd* matched) {
   if (!EigenSupportsContractionOutputKernel()) return false;
 
   // Root of the pattern must be a BiasAdd.
-  if (node == nullptr) return false;
-  if (node->op() != "BiasAdd") return false;
-  if (!NodeIsOnCpu(node)) return false;
-  if (!IsFloatOrDoubleDataType(node)) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+  if (!bias_add || !IsBiasAdd(*bias_add) ||
+      HasControlFaninOrFanout(ctx.graph_view, bias_add))
+    return false;
 
   // Input to the BiasAdd must be a Squeeze.
-  const auto bias_input_port = GraphView::InputPort(node, 0);
+  const auto bias_input_port = GraphView::InputPort(bias_add, 0);
   const auto squeeze = ctx.graph_view.GetRegularFanin(bias_input_port);
-  if (squeeze.node == nullptr) return false;
-  if (squeeze.node->op() != "Squeeze") return false;
-  if (!NodeIsOnCpu(squeeze.node)) return false;
-  if (!HaveSameDataType(node, squeeze.node, "T")) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, squeeze.node)) return false;
-  if (!HasSingleFanoutNode(ctx.graph_view, squeeze.node)) return false;
-  if (IsInPreserveSet(ctx, squeeze.node)) return false;
+
+  if (!squeeze.node || !IsSqueeze(*squeeze.node) ||
+      !HaveSameDataType(bias_add, squeeze.node, "T") ||
+      HasControlFaninOrFanout(ctx.graph_view, squeeze.node) ||
+      !HasSingleFanoutNode(ctx.graph_view, squeeze.node) ||
+      IsInPreserveSet(ctx, squeeze.node))
+    return false;
 
   // Squeeze must not squeeze output channel dimension.
   std::vector<int32> dims;
@@ -212,67 +317,72 @@ bool FindConv2DWithSqueezeAndBias(const RemapperContext& ctx,
     if (dim == 3) return false;
   }
 
-  // Input to the Squeeze must be a Conv2D in NHWC format.
+  // Input to the Squeeze must be a Conv2D.
   const auto squeeze_input_port = GraphView::InputPort(squeeze.node, 0);
   const auto conv2d = ctx.graph_view.GetRegularFanin(squeeze_input_port);
-  if (conv2d.node == nullptr) return false;
-  if (conv2d.node->op() != "Conv2D") return false;
-  if (conv2d.node->attr().at("data_format").s() != "NHWC") return false;
-  if (!NodeIsOnCpu(conv2d.node)) return false;
-  if (!HaveSameDataType(node, conv2d.node, "T")) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, conv2d.node)) return false;
-  if (!HasSingleFanoutNode(ctx.graph_view, conv2d.node)) return false;
-  if (IsInPreserveSet(ctx, conv2d.node)) return false;
+
+  if (!conv2d.node || !IsConv2D(*conv2d.node) ||
+      !HaveSameDataType(bias_add, conv2d.node, "T") ||
+      HasControlFaninOrFanout(ctx.graph_view, conv2d.node) ||
+      !HasSingleFanoutNode(ctx.graph_view, conv2d.node) ||
+      IsInPreserveSet(ctx, conv2d.node))
+    return false;
+
+  // Check that data type and data format are supported on assigned device.
+  const Conv2DWithSqueezeAndBiasAdd pattern{conv2d.node, squeeze.node,
+                                            bias_add};
+  if (!IsDeviceCompatible(ctx, pattern)) return false;
 
   // We successfully found a Conv2D+Squeeze+BiasAdd pattern.
-  matched->conv2d = conv2d.node;
-  matched->squeeze = squeeze.node;
-  matched->bias_add = node;
+  *matched = pattern;
 
   return true;
 }
 
-bool FindConv2DWithBatchNorm(const RemapperContext& ctx, const NodeDef* node,
+bool FindConv2DWithBatchNorm(const RemapperContext& ctx,
+                             const NodeDef* batch_norm,
                              Conv2DWithBatchNorm* matched) {
   if (!EigenSupportsContractionOutputKernel()) return false;
 
   // Root of the pattern must be a FusedBatchNorm or a FusedBatchNormV2.
-  if (node == nullptr) return false;
-  if (!IsFusedBatchNorm(*node)) return false;
-  if (!NodeIsOnCpu(node)) return false;
-  if (!HasDataType(node, DT_FLOAT)) return false;
+  if (!batch_norm || !IsFusedBatchNorm(*batch_norm)) return false;
 
   // V2 has a separate data type for the scale/offset/mean/variance inputs.
-  if (node->op() == "FusedBatchNormV2" && !HasDataType(node, DT_FLOAT, "U"))
+  if (batch_norm->op() == "FusedBatchNormV2" &&
+      !HasDataType(batch_norm, DT_FLOAT, "U"))
     return false;
 
   // Check that batch normalization is in inference mode.
-  const auto& attr = node->attr();
+  const auto& attr = batch_norm->attr();
   if (attr.count(kIsTraining) > 0 && attr.at(kIsTraining).b()) return false;
 
   // Check that only 0th output is consumed by other nodes.
-  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
-  if (HasFanouts(ctx.graph_view, node, 1)) return false;  // batch_mean
-  if (HasFanouts(ctx.graph_view, node, 2)) return false;  // batch_variance
-  if (HasFanouts(ctx.graph_view, node, 3)) return false;  // reserve_space_1
-  if (HasFanouts(ctx.graph_view, node, 4)) return false;  // reserve_space_2
+  if (HasControlFaninOrFanout(ctx.graph_view, batch_norm) ||
+      HasFanouts(ctx.graph_view, batch_norm, 1) ||  // batch_mean
+      HasFanouts(ctx.graph_view, batch_norm, 2) ||  // batch_variance
+      HasFanouts(ctx.graph_view, batch_norm, 3) ||  // reserve_space_1
+      HasFanouts(ctx.graph_view, batch_norm, 4))    // reserve_space_2
+    return false;
 
-  // Input to the FusedBatchNorm must be a Conv2D in NHWC format.
-  const auto input_port = GraphView::InputPort(node, 0);
+  // Input to the FusedBatchNorm must be a Conv2D.
+  const auto input_port = GraphView::InputPort(batch_norm, 0);
   const auto conv2d = ctx.graph_view.GetRegularFanin(input_port);
-  if (conv2d.node == nullptr) return false;
-  if (!IsConv2D(*conv2d.node)) return false;
-  if (conv2d.node->attr().at(kDataFormat).s() != "NHWC") return false;
-  if (!NodeIsOnCpu(conv2d.node)) return false;
-  if (!HaveSameDataType(node, conv2d.node)) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, conv2d.node)) return false;
-  if (!HasSingleFanoutNode(ctx.graph_view, conv2d.node)) return false;
-  if (IsInPreserveSet(ctx, conv2d.node)) return false;
+
+  if (!conv2d.node || !IsConv2D(*conv2d.node) ||               //
+      !NodeIsOnCpu(conv2d.node) ||                             //
+      !HaveSameDataType(batch_norm, conv2d.node) ||            //
+      !IsCpuCompatibleDataType(conv2d.node) ||                 //
+      !IsCpuCompatibleDataFormat(conv2d.node) ||               //
+      HasControlFaninOrFanout(ctx.graph_view, conv2d.node) ||  //
+      !HasSingleFanoutNode(ctx.graph_view, conv2d.node) ||     //
+      IsInPreserveSet(ctx, conv2d.node))
+    return false;
 
   // We successfully found a Conv2D+FusedBatchNorm pattern.
   matched->conv2d = conv2d.node;
-  matched->fused_batch_norm = node;
-  if (!GetNodeAttr(*node, "epsilon", &matched->epsilon).ok()) return false;
+  matched->fused_batch_norm = batch_norm;
+  if (!GetNodeAttr(*batch_norm, "epsilon", &matched->epsilon).ok())
+    return false;
 
   return true;
 }
@@ -283,21 +393,19 @@ bool FindConv2DWithBatchNormAndRelu(const RemapperContext& ctx,
   if (!EigenSupportsContractionOutputKernel()) return false;
 
   // Root of the pattern must be a Relu.
-  if (node == nullptr) return false;
-  if (!IsRelu(*node)) return false;
-  if (!NodeIsOnCpu(node)) return false;
-  if (!IsFloatOrDoubleDataType(node)) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+  if (!node || !IsRelu(*node) || HasControlFaninOrFanout(ctx.graph_view, node))
+    return false;
 
   // And input to Relu must match Conv2DWithBatchNorm pattern.
   const auto input_port = GraphView::InputPort(node, 0);
   const auto batch_norm = ctx.graph_view.GetRegularFanin(input_port);
 
   Conv2DWithBatchNorm base;
-  if (!FindConv2DWithBatchNorm(ctx, batch_norm.node, &base)) return false;
-  if (!HasSingleFanoutNode(ctx.graph_view, base.fused_batch_norm)) return false;
-  if (!HaveSameDataType(node, base.fused_batch_norm)) return false;
-  if (IsInPreserveSet(ctx, base.fused_batch_norm)) return false;
+  if (!FindConv2DWithBatchNorm(ctx, batch_norm.node, &base) ||
+      !HasSingleFanoutNode(ctx.graph_view, base.fused_batch_norm) ||
+      !HaveSameDataType(node, base.fused_batch_norm) ||
+      IsInPreserveSet(ctx, base.fused_batch_norm))
+    return false;
 
   // We successfully found a Conv2D+FusedBatchNorm+Relu pattern.
   matched->conv2d = base.conv2d;
@@ -355,9 +463,7 @@ bool FindFusedBatchNorm(const RemapperContext& ctx, const NodeDef* node,
   return true;
 }
 
-void CopyConv2DAttributes(const NodeDef* conv2d, NodeDef* fused_conv2d,
-                          const std::vector<string>& fused_ops = {},
-                          int num_args = 1, float epsilon = 0.0) {
+void CopyConv2DAttributes(const NodeDef* conv2d, NodeDef* fused_conv2d) {
   auto* attr = fused_conv2d->mutable_attr();
   auto src_attr = conv2d->attr();
 
@@ -367,53 +473,65 @@ void CopyConv2DAttributes(const NodeDef* conv2d, NodeDef* fused_conv2d,
   (*attr)["dilations"] = src_attr.at("dilations");
   (*attr)["data_format"] = src_attr.at("data_format");
   (*attr)["use_cudnn_on_gpu"] = src_attr.at("use_cudnn_on_gpu");
+}
 
-  auto* fused_ops_attr = (*attr)["fused_ops"].mutable_list();
-  for (const string& fused_op : fused_ops) {
-    fused_ops_attr->add_s(fused_op);
-  }
-
+void SetFusedConv2DAttributes(
+    NodeDef* fused_conv2d, const absl::Span<const absl::string_view> fused_ops,
+    int num_args = 1, float epsilon = 0.0) {
+  auto* attr = fused_conv2d->mutable_attr();
+  SetAttrValue(fused_ops, &(*attr)["fused_ops"]);
   SetAttrValue(num_args, &(*attr)["num_args"]);
-  // Required only for FusedBatchNorm.
-  SetAttrValue(epsilon, &(*attr)["epsilon"]);
+  SetAttrValue(epsilon, &(*attr)["epsilon"]);  // required only for BatchNorm
 }
 
 void AddFusedConv2DNode(
-    const Conv2DWithBiasAdd& matched, GraphDef* optimized_graph,
+    const RemapperContext& ctx, const Conv2DWithBiasAdd& matched,
+    GraphDef* optimized_graph,
     absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
-  VLOG(2) << "Fuse Conv2D with BiasAdd: bias_add=" << matched.bias_add->name()
+  DCHECK(IsDeviceCompatible(ctx, matched))
+      << "Unsupported fused Conv2D pattern";
+
+  VLOG(2) << "Fuse Conv2D with BiasAdd: "
+          << " bias_add=" << matched.bias_add->name()
           << " conv2d=" << matched.conv2d->name();
 
   NodeDef* fused_conv2d = optimized_graph->add_node();
-  fused_conv2d->set_name(matched.bias_add->name());
   fused_conv2d->set_op(kFusedConv2D);
-  fused_conv2d->set_device(matched.bias_add->device());
+  fused_conv2d->set_name(matched.bias_add->name());
+  fused_conv2d->set_device(matched.conv2d->device());
   fused_conv2d->add_input(matched.conv2d->input(0));    // 0: input
   fused_conv2d->add_input(matched.conv2d->input(1));    // 1: filter
   fused_conv2d->add_input(matched.bias_add->input(1));  // 2: bias
 
-  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"BiasAdd"});
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d);
+  SetFusedConv2DAttributes(fused_conv2d, {"BiasAdd"});
 
   invalidated_nodes->insert(matched.bias_add);
   invalidated_nodes->insert(matched.conv2d);
 }
 
 void AddFusedConv2DNode(
-    const Conv2DWithBiasAddAndRelu& matched, GraphDef* optimized_graph,
+    const RemapperContext& ctx, const Conv2DWithBiasAddAndRelu& matched,
+    GraphDef* optimized_graph,
     absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
-  VLOG(2) << "Fuse Conv2D with BiasAdd and Relu: relu=" << matched.relu->name()
+  DCHECK(IsDeviceCompatible(ctx, matched))
+      << "Unsupported fused Conv2D pattern";
+
+  VLOG(2) << "Fuse Conv2D with BiasAdd and Relu: "
+          << " relu=" << matched.relu->name()
           << " bias_add=" << matched.bias_add->name()
           << " conv2d=" << matched.conv2d->name();
 
   NodeDef* fused_conv2d = optimized_graph->add_node();
   fused_conv2d->set_name(matched.relu->name());
   fused_conv2d->set_op(kFusedConv2D);
-  fused_conv2d->set_device(matched.relu->device());
+  fused_conv2d->set_device(matched.conv2d->device());
   fused_conv2d->add_input(matched.conv2d->input(0));    // 0: input
   fused_conv2d->add_input(matched.conv2d->input(1));    // 1: filter
   fused_conv2d->add_input(matched.bias_add->input(1));  // 2: bias
 
-  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"BiasAdd", "Relu"});
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d);
+  SetFusedConv2DAttributes(fused_conv2d, {"BiasAdd", "Relu"});
 
   invalidated_nodes->insert(matched.relu);
   invalidated_nodes->insert(matched.bias_add);
@@ -421,8 +539,12 @@ void AddFusedConv2DNode(
 }
 
 void AddFusedConv2DNode(
-    const Conv2DWithSqueezeAndBiasAdd& matched, GraphDef* optimized_graph,
+    const RemapperContext& ctx, const Conv2DWithSqueezeAndBiasAdd& matched,
+    GraphDef* optimized_graph,
     absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
+  DCHECK(IsDeviceCompatible(ctx, matched))
+      << "Unsupported fused Conv2D pattern";
+
   VLOG(2) << "Fuse Conv2D with Squeeze and BiasAdd: "
           << " bias_add=" << matched.bias_add->name()
           << " squeeze=" << matched.squeeze->name()
@@ -432,13 +554,14 @@ void AddFusedConv2DNode(
   // has single consumer (only the squeeze node).
   NodeDef* fused_conv2d = optimized_graph->add_node();
   fused_conv2d->set_name(matched.conv2d->name());
-  fused_conv2d->set_op("_FusedConv2D");
+  fused_conv2d->set_op(kFusedConv2D);
   fused_conv2d->set_device(matched.conv2d->device());
   fused_conv2d->add_input(matched.conv2d->input(0));    // 0: input
   fused_conv2d->add_input(matched.conv2d->input(1));    // 1: filter
   fused_conv2d->add_input(matched.bias_add->input(1));  // 2: bias
 
-  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"BiasAdd"});
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d);
+  SetFusedConv2DAttributes(fused_conv2d, {"BiasAdd"});
 
   // Replace BiasAdd node with a Squeeze.
   NodeDef* remapped_squeeze = optimized_graph->add_node();
@@ -461,7 +584,7 @@ void AddFusedConv2DNode(
   NodeDef* fused_conv2d = optimized_graph->add_node();
   fused_conv2d->set_name(matched.fused_batch_norm->name());
   fused_conv2d->set_op(kFusedConv2D);
-  fused_conv2d->set_device(matched.fused_batch_norm->device());
+  fused_conv2d->set_device(matched.conv2d->device());
   fused_conv2d->add_input(matched.conv2d->input(0));            // 0: input
   fused_conv2d->add_input(matched.conv2d->input(1));            // 1: filter
   fused_conv2d->add_input(matched.fused_batch_norm->input(1));  // 2: scale
@@ -469,8 +592,9 @@ void AddFusedConv2DNode(
   fused_conv2d->add_input(matched.fused_batch_norm->input(3));  // 4: mean
   fused_conv2d->add_input(matched.fused_batch_norm->input(4));  // 5: variance
 
-  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"FusedBatchNorm"},
-                       /*num_args*/ 4, /*epsilon*/ matched.epsilon);
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d);
+  SetFusedConv2DAttributes(fused_conv2d, {"FusedBatchNorm"},
+                           /*num_args=*/4, /*epsilon=*/matched.epsilon);
 
   invalidated_nodes->insert(matched.fused_batch_norm);
   invalidated_nodes->insert(matched.conv2d);
@@ -487,7 +611,7 @@ void AddFusedConv2DNode(
   NodeDef* fused_conv2d = optimized_graph->add_node();
   fused_conv2d->set_name(matched.relu->name());
   fused_conv2d->set_op(kFusedConv2D);
-  fused_conv2d->set_device(matched.fused_batch_norm->device());
+  fused_conv2d->set_device(matched.conv2d->device());
   fused_conv2d->add_input(matched.conv2d->input(0));            // 0: input
   fused_conv2d->add_input(matched.conv2d->input(1));            // 1: filter
   fused_conv2d->add_input(matched.fused_batch_norm->input(1));  // 2: scale
@@ -495,8 +619,9 @@ void AddFusedConv2DNode(
   fused_conv2d->add_input(matched.fused_batch_norm->input(3));  // 4: mean
   fused_conv2d->add_input(matched.fused_batch_norm->input(4));  // 5: variance
 
-  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"FusedBatchNorm", "Relu"},
-                       /*num_args*/ 4, /*epsilon*/ matched.epsilon);
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d);
+  SetFusedConv2DAttributes(fused_conv2d, {"FusedBatchNorm", "Relu"},
+                           /*num_args=*/4, /*epsilon=*/matched.epsilon);
 
   invalidated_nodes->insert(matched.relu);
   invalidated_nodes->insert(matched.fused_batch_norm);
@@ -680,13 +805,14 @@ Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item,
 
     // Remap Conv2D+BiasAdd into the _FusedConv2D.
     if (FindConv2DWithBias(ctx, &node, &conv2d_with_bias)) {
-      AddFusedConv2DNode(conv2d_with_bias, optimized_graph, &invalidated_nodes);
+      AddFusedConv2DNode(ctx, conv2d_with_bias, optimized_graph,
+                         &invalidated_nodes);
       continue;
     }
 
     // Remap Conv2D+BiasAdd+Relu into the _FusedConv2D.
     if (FindConv2DWithBiasAndRelu(ctx, &node, &conv2d_with_bias_and_relu)) {
-      AddFusedConv2DNode(conv2d_with_bias_and_relu, optimized_graph,
+      AddFusedConv2DNode(ctx, conv2d_with_bias_and_relu, optimized_graph,
                          &invalidated_nodes);
       continue;
     }
@@ -694,7 +820,7 @@ Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item,
     // Remap Conv2D+Squeeze+BiasAdd into the _FusedConv2D+Squeeze.
     if (FindConv2DWithSqueezeAndBias(ctx, &node,
                                      &conv2d_with_squeeze_and_bias)) {
-      AddFusedConv2DNode(conv2d_with_squeeze_and_bias, optimized_graph,
+      AddFusedConv2DNode(ctx, conv2d_with_squeeze_and_bias, optimized_graph,
                          &invalidated_nodes);
       continue;
     }
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index d8fb88d95f25732daa198a3fa1d031ae11981b47..375c3e56c80aa65cd9e5ab0e2248b81d3e3db776 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include <queue>
 #include <vector>
 
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -166,10 +168,10 @@ string AddPrefixToNodeName(const string& name, const string& prefix,
                            const string& delimiter) {
   if (!name.empty()) {
     if (name[0] == '^') {
-      return strings::StrCat("^", prefix, delimiter, name.substr(1));
+      return absl::StrCat("^", prefix, delimiter, name.substr(1));
     }
   }
-  return strings::StrCat(prefix, delimiter, name);
+  return absl::StrCat(prefix, delimiter, name);
 }
 
 string AddPrefixToNodeName(const string& name, const string& prefix) {
@@ -193,20 +195,26 @@ bool ExecuteWithTimeout(std::function<void()> fn, const int64 timeout_in_ms,
 }
 
 string AsControlDependency(const NodeDef& node) {
-  return strings::StrCat("^", node.name());
+  return absl::StrCat("^", node.name());
 }
 
 string AsControlDependency(const string& node_name) {
   CHECK(!node_name.empty());
   return (!node_name.empty() && node_name[0] == '^')
              ? node_name
-             : strings::StrCat("^", node_name);
+             : absl::StrCat("^", node_name);
 }
 
 bool NodeIsOnCpu(const NodeDef* node) {
   string task, device;
   return DeviceNameUtils::SplitDeviceName(node->device(), &task, &device) &&
-         str_util::StartsWith(device, DEVICE_CPU);
+         absl::StartsWith(device, DEVICE_CPU);
+}
+
+bool NodeIsOnGpu(const NodeDef* node) {
+  string task, device;
+  return DeviceNameUtils::SplitDeviceName(node->device(), &task, &device) &&
+         absl::StartsWith(device, DEVICE_GPU);
 }
 
 int NumOutputs(const NodeDef& node, GraphDef* graph) {
@@ -402,175 +410,6 @@ void EraseNodesFromGraph(const std::set<string>& nodes_to_delete,
   EraseNodesFromGraphImpl(nodes_idx_to_delete, graph);
 }
 
-Status SimpleGraphView::Initialize(
-    const GraphDef& graph,
-    const std::vector<std::pair<const NodeDef*, const NodeDef*>>*
-        extra_dependencies,
-    bool dedup_inputs, bool dedup_outputs) {
-  graph_ = &graph;
-  const int num_nodes = graph.node_size();
-  inputs_.clear();
-  inputs_.resize(num_nodes);
-  outputs_.clear();
-  outputs_.resize(num_nodes);
-  name_to_index_.clear();
-  name_to_index_.reserve(num_nodes);
-  index_to_name_.clear();
-  index_to_name_.reserve(num_nodes);
-
-  // Build map from name to index and vice versa.
-  for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
-    const NodeDef& node = graph.node(node_idx);
-    name_to_index_.emplace(node.name(), node_idx);
-    index_to_name_.push_back(node.name());
-  }
-
-  if (extra_dependencies) {
-    for (const auto& dep : *extra_dependencies) {
-      auto itr_src = name_to_index_.find(dep.first->name());
-      if (itr_src == name_to_index_.end()) {
-        return errors::InvalidArgument("Non-existent src ", dep.first->name());
-      }
-      auto itr_tgt = name_to_index_.find(dep.second->name());
-      if (itr_tgt == name_to_index_.end()) {
-        return errors::InvalidArgument("Non-existent tgt ", dep.second->name());
-      }
-      const int src_idx = itr_src->second;
-      const int tgt_idx = itr_tgt->second;
-      inputs_[tgt_idx].push_back(src_idx);
-      outputs_[src_idx].push_back(tgt_idx);
-    }
-  }
-
-  // Build forward and reverse adjacency lists.
-  for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
-    const NodeDef& node = graph.node(node_idx);
-    inputs_[node_idx].reserve(node.input_size());
-    for (const string& input : node.input()) {
-      auto it = name_to_index_.find(NodeName(input));
-      if (it == name_to_index_.end()) {
-        return errors::InvalidArgument("Non-existent input ", input,
-                                       " for node ", node.name());
-      }
-      const int input_idx = it->second;
-      inputs_[node_idx].push_back(input_idx);
-      outputs_[input_idx].push_back(node_idx);
-    }
-    if (dedup_inputs) {
-      // Dedup the input list while it's still hot in cache.
-      STLSortAndRemoveDuplicates(&inputs_[node_idx]);
-    }
-  }
-
-  // Dedup outputs.
-  if (dedup_outputs) {
-    for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
-      STLSortAndRemoveDuplicates(&outputs_[node_idx]);
-    }
-  }
-  return Status::OK();
-}
-
-void SimpleGraphView::DepthFirstSearch(
-    const std::unordered_set<string>& op_types_to_traverse, int root_node,
-    std::set<int>* nodes_found,
-    SimpleGraphView::SearchDirection direction) const {
-  nodes_found->clear();
-  const string& op_type = graph_->node(root_node).op();
-  if (!op_types_to_traverse.empty() &&
-      op_types_to_traverse.find(op_type) == op_types_to_traverse.end()) {
-    return;
-  }
-  std::vector<int> stack;
-  stack.reserve(32);
-  stack.push_back(root_node);
-
-  auto push_neighbors = [&stack,
-                         &nodes_found](absl::Span<const int> neighbors) {
-    for (auto output_idx : neighbors) {
-      if (nodes_found->find(output_idx) == nodes_found->end()) {
-        stack.push_back(output_idx);
-      }
-    }
-  };
-
-  while (!stack.empty()) {
-    const int node_idx = stack.back();
-    stack.pop_back();
-    nodes_found->insert(node_idx);
-    const string& op_type = graph_->node(node_idx).op();
-    if (op_types_to_traverse.empty() ||
-        op_types_to_traverse.find(op_type) != op_types_to_traverse.end()) {
-      if (direction == kFollowOutputs) {
-        push_neighbors(this->outputs(node_idx));
-      } else {
-        push_neighbors(this->inputs(node_idx));
-      }
-    }
-  }
-}
-
-void SimpleGraphView::DepthFirstSearchWithCallback(
-    const std::unordered_set<string>& op_types_to_traverse, int node_idx,
-    SimpleGraphView::DFSCallback callback,
-    SimpleGraphView::SearchDirection direction) const {
-  std::set<int> nodes_found;
-  nodes_found.clear();
-  const string& op_type = graph_->node(node_idx).op();
-  if (!op_types_to_traverse.empty() &&
-      op_types_to_traverse.find(op_type) == op_types_to_traverse.end()) {
-    return;
-  }
-  std::vector<int> stack;
-  stack.reserve(32);
-  stack.push_back(node_idx);
-  auto push_neighbors = [&stack,
-                         &nodes_found](absl::Span<const int> neighbors) {
-    for (auto output_idx : neighbors) {
-      if (nodes_found.find(output_idx) == nodes_found.end()) {
-        stack.push_back(output_idx);
-      }
-    }
-  };
-  while (!stack.empty()) {
-    const int node_idx = stack.back();
-    stack.pop_back();
-    if (callback(graph_->node(node_idx))) {
-      return;
-    }
-    nodes_found.insert(node_idx);
-    const string& op_type = graph_->node(node_idx).op();
-    if (op_types_to_traverse.empty() ||
-        op_types_to_traverse.find(op_type) != op_types_to_traverse.end()) {
-      if (direction == kFollowOutputs) {
-        push_neighbors(this->outputs(node_idx));
-      } else {
-        push_neighbors(this->inputs(node_idx));
-      }
-    }
-  }
-}
-
-string SimpleGraphView::PrintToString() const {
-  string str;
-  for (int i = 0; i < num_nodes(); ++i) {
-    strings::StrAppend(&str, "Node ", i, "'", node_name(i), "'\n", "Inputs: [");
-    for (int input : inputs(i)) {
-      strings::StrAppend(&str, input, " '", node_name(input), "', ");
-    }
-    strings::StrAppend(&str, "]\n", "Outputs: [");
-    for (int j = 0; j < outputs(i).size(); ++j) {
-      const int output = outputs(i)[j];
-      if (j > 0) {
-        strings::StrAppend(&str, ", ");
-      }
-      strings::StrAppend(&str, output, " '", node_name(output), "'");
-    }
-    strings::StrAppend(&str, "]\n");
-  }
-  return str;
-}
-
 #define HANDLE_CASE(DTYPE)                                          \
   case DTYPE:                                                       \
     if (!SafeSetScalarTensorValue<EnumToDataType<DTYPE>::Type>(     \
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index a9d6abd6db952e69ebfab2ee16a50f501af1644c..9053ae4c07dae96c96bac416cf9e175c88462c33 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -242,6 +242,9 @@ string AsControlDependency(const string& node);
 // Returns true if the node is assigned to run on CPU device.
 bool NodeIsOnCpu(const NodeDef* node);
 
+// Returns true if the node is assigned to run on GPU device.
+bool NodeIsOnGpu(const NodeDef* node);
+
 // Returns the number of outputs of a node according to its OpDef. Note that
 // some of the outputs may be unconnected.
 int NumOutputs(const NodeDef& node, GraphDef* graph);
@@ -302,79 +305,6 @@ void EraseNodesFromGraph(std::vector<int>&& nodes_to_delete, GraphDef* graph);
 void EraseNodesFromGraph(const std::set<string>& nodes_to_delete,
                          GraphDef* graph);
 
-class SimpleGraphView {
- public:
-  // Build a graph view for the specified graphdef.
-  Status Initialize(const GraphDef& graph) {
-    return Initialize(graph, nullptr, true, true);
-  }
-  // Build a graph view for the specified graphdef augmented with the additional
-  // edges specified in 'extra_dependencies' if any. Note that
-  // extra_dependencies can be null.
-  Status Initialize(
-      const GraphDef& graph,
-      const std::vector<std::pair<const NodeDef*, const NodeDef*>>*
-          extra_dependencies) {
-    return Initialize(graph, extra_dependencies, true, true);
-  }
-  Status Initialize(
-      const GraphDef& graph,
-      const std::vector<std::pair<const NodeDef*, const NodeDef*>>*
-          extra_dependencies,
-      bool dedup_inputs, bool dedup_outputs);
-
-  const GraphDef* graph() const { return graph_; }
-  inline int num_nodes() const { return index_to_name_.size(); }
-  inline bool has_node(const string& node_name) const {
-    return name_to_index_.find(node_name) != name_to_index_.end();
-  }
-  inline const int index(const string& node_name) const {
-    const auto& it = name_to_index_.find(node_name);
-    DCHECK(it != name_to_index_.end());
-    return it == name_to_index_.end() ? -1 : it->second;
-  }
-  inline const NodeDef& node(int node_idx) const {
-    return graph_->node(node_idx);
-  }
-  inline const string& node_name(int node_idx) const {
-    return index_to_name_[node_idx];
-  }
-  inline const gtl::InlinedVector<int, 4>& inputs(int node_idx) const {
-    return inputs_[node_idx];
-  }
-  inline const gtl::InlinedVector<int, 2>& outputs(int node_idx) const {
-    return outputs_[node_idx];
-  }
-
-  enum SearchDirection { kFollowOutputs = 1, kFollowInputs = 2 };
-
-  // Traverse the graph starting at `node_idx`, collecting indices of nodes
-  // visited in nodes_found. If a node has an op in `op_types_to_traverse`, the
-  // walk continues to its children. It is assumed that *graph_ was not modified
-  // after the call to Initialize().
-  // If `op_types_to_traverse` is empty the DFS will traverse any node type.
-  void DepthFirstSearch(const std::unordered_set<string>& op_types_to_traverse,
-                        int node_idx, std::set<int>* nodes_found,
-                        SearchDirection direction = kFollowOutputs) const;
-
-  typedef std::function<bool(const NodeDef&)> DFSCallback;
-
-  // Like DepthFirstSearch, but invoke `callback` as each node is discovered. If
-  // `callback` returns true, the search is terminated early.
-  void DepthFirstSearchWithCallback(
-      const std::unordered_set<string>& op_types_to_traverse, int node_idx,
-      DFSCallback callback, SearchDirection direction = kFollowOutputs) const;
-
-  string PrintToString() const;
-
- private:
-  const GraphDef* graph_;  // Not owned.
-  std::vector<string> index_to_name_;
-  gtl::FlatMap<string, int> name_to_index_;
-  std::vector<gtl::InlinedVector<int, 4>> inputs_;
-  std::vector<gtl::InlinedVector<int, 2>> outputs_;
-};
-
 }  // end namespace grappler
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index c0f19d3828ac1581a937531318ff62875fbf3bc7..1fd0a02b65e3a212780b6fdabadce98833b3ebda 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -48,8 +48,11 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_topology_view",
+        "//tensorflow/core/grappler:graph_view",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
+        "@com_google_absl//absl/types:span",
     ],
 )
 
@@ -58,10 +61,11 @@ tf_cc_test(
     srcs = ["topological_sort_test.cc"],
     deps = [
         ":topological_sort",
-        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -101,8 +105,7 @@ cc_library(
     deps = [
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/grappler:graph_view",
-        "//tensorflow/core/grappler:mutable_graph_view",
+        "//tensorflow/core/grappler:graph_topology_view",
         "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
@@ -116,6 +119,8 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -138,6 +143,7 @@ cc_library(
         "//tensorflow/core:test",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
+        "@com_google_absl//absl/algorithm:container",
     ],
 )
 
@@ -173,6 +179,10 @@ cc_library(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -191,6 +201,7 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index f2894a942bd3dac3e22748787eaa24717ed61555..150728d030361cab80dc6e759213c8591d082b92 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -14,8 +14,9 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/grappler/utils/functions.h"
 
-#include <unordered_map>
-
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_cat.h"
 #include "absl/strings/substitute.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.h"
@@ -28,7 +29,6 @@ limitations under the License.
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
-#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/scanner.h"
 
 namespace tensorflow {
@@ -76,16 +76,6 @@ Status ResolveFunctionBodyNodeAttrPlaceholders(
 
 }  // namespace
 
-FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
-    const FunctionLibraryDefinition& flib, const GraphDef& graph) {
-  return flib.ReachableDefinitions(graph);
-}
-
-FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
-    const FunctionLibraryDefinition& flib, const FunctionDef& func) {
-  return flib.ReachableDefinitions(func);
-}
-
 void GrapplerFunctionConnectivity::RegisterInputArgExpansion(
     InputArgExpansion input_arg_expansion) {
   string input_name = input_arg_expansion.input_name;
@@ -94,7 +84,7 @@ void GrapplerFunctionConnectivity::RegisterInputArgExpansion(
   for (int i = 0; i < placeholders.size(); ++i) {
     const string& placeholder = input_arg_expansion.placeholders[i];
     input_arg_placeholders_.insert(
-        {placeholder, InputArgPlaceholder{input_name, /*input_position=*/i}});
+        {placeholder, InputArgPlaceholder{input_name, /*input_index=*/i}});
   }
   input_arg_expansions_.insert(
       {std::move(input_name), std::move(input_arg_expansion)});
@@ -193,7 +183,7 @@ Status GrapplerFunctionConnectivity::ExpandFunctionDefInput(
           // If position is not defined expand node output range
           for (int i = output_range.first; i < output_range.second; ++i) {
             graph_def_inputs->push_back(
-                i == 0 ? node_name : strings::StrCat(node_name, ":", i));
+                i == 0 ? node_name : absl::StrCat(node_name, ":", i));
           }
         } else {
           if (position > (output_range.second - output_range.first)) {
@@ -203,7 +193,7 @@ Status GrapplerFunctionConnectivity::ExpandFunctionDefInput(
           }
           int pos = output_range.first + position;
           graph_def_inputs->push_back(
-              pos == 0 ? node_name : strings::StrCat(node_name, ":", pos));
+              pos == 0 ? node_name : absl::StrCat(node_name, ":", pos));
         }
 
         return Status::OK();
@@ -232,39 +222,39 @@ Status GrapplerFunctionConnectivity::ExpandNodeInputs(
 
 Status GrapplerFunctionConnectivity::AsFunctionDefInput(
     const string& graph_def_input, string* func_def_input) const {
-  using gtl::FindOrNull;
-
   if (IsControlInput(graph_def_input)) {
     *func_def_input = graph_def_input;
     return Status::OK();
   }
 
-  int position;
-  string node_name = ParseNodeName(graph_def_input, &position);
-  CHECK_GE(position, 0);
+  const TensorId tensor = ParseTensorName(graph_def_input);
+  DCHECK_GE(tensor.index(), 0);
+
+  const absl::string_view node_name = tensor.node();
+  const int index = tensor.index();
 
   // Check if it's an input arg placeholder
-  if (position == 0) {
-    const InputArgPlaceholder* placeholder =
-        FindOrNull(input_arg_placeholders_, node_name);
-    if (placeholder != nullptr) {
-      *func_def_input = strings::StrCat(placeholder->input_name, ":",
-                                        placeholder->input_position);
+  if (tensor.index() == 0) {
+    const auto is_input_placeholder = input_arg_placeholders_.find(node_name);
+    if (is_input_placeholder != input_arg_placeholders_.end()) {
+      const InputArgPlaceholder& placeholder = is_input_placeholder->second;
+      *func_def_input =
+          absl::StrCat(placeholder.input_name, ":", placeholder.input_index);
       return Status::OK();
     }
   }
 
   // It must be output from one of the function body nodes
-  const tensorflow::NameRangeMap* outputs_range_map =
-      FindOrNull(function_body_outputs_, node_name);
-  if (outputs_range_map != nullptr) {
-    for (const auto& el : *outputs_range_map) {
+  const auto is_body_output = function_body_outputs_.find(tensor.node());
+  if (is_body_output != function_body_outputs_.end()) {
+    const tensorflow::NameRangeMap& outputs_range_map = is_body_output->second;
+
+    for (const auto& el : outputs_range_map) {
       const auto& output_name = el.first;
       const auto& output_range = el.second;
-      if (position >= output_range.first && position < output_range.second) {
-        int pos = position - output_range.first;
-        *func_def_input =
-            strings::StrCat(node_name, ":", output_name, ":", pos);
+      if (index >= output_range.first && index < output_range.second) {
+        int pos = index - output_range.first;
+        *func_def_input = absl::StrCat(node_name, ":", output_name, ":", pos);
         return Status::OK();
       }
     }
@@ -338,13 +328,12 @@ GrapplerFunctionItem::GrapplerFunctionItem(
   for (const InputArgExpansion& input_arg : input_arg_expansions_) {
     for (const string& placeholder : input_arg.placeholders) {
       feed.push_back({placeholder, Tensor()});
-      input_arg_placeholders_.insert(placeholder);
     }
   }
   // Fill the fetch nodes with outputs.
   for (const OutputArgExpansion& output_arg : output_arg_expansions_) {
-    for (const string& output_tensor : output_arg.output_tensors) {
-      fetch.push_back(output_tensor);
+    for (const string& output_node : output_arg.output_nodes) {
+      fetch.push_back(output_node);
     }
   }
 
@@ -367,11 +356,6 @@ const std::size_t GrapplerFunctionItem::input_size() const {
   return input_arg_expansions_.size();
 }
 
-bool GrapplerFunctionItem::IsInputPlaceholder(const string& node_name) const {
-  return input_arg_placeholders_.find(node_name) !=
-         input_arg_placeholders_.end();
-}
-
 const std::vector<OutputArgExpansion>& GrapplerFunctionItem::outputs() const {
   return output_arg_expansions_;
 }
@@ -426,7 +410,7 @@ bool IsParametrized(const FunctionDef& func) {
 
 Status InstantiationTypeParameters(
     const FunctionDef& func, const AttrSlice& func_instantiation_attr,
-    std::unordered_map<string, DataType>* type_parameters) {
+    absl::flat_hash_map<string, DataType>* type_parameters) {
   if (!type_parameters->empty()) {
     return errors::InvalidArgument("Type parameters output map must be empty");
   }
@@ -454,7 +438,7 @@ Status InstantiationTypeParameters(
 
 Status InstantiationBodyParameters(
     const FunctionDef& func, const AttrSlice& func_instantiation_attr,
-    std::unordered_map<string, AttrValue>* body_parameters) {
+    absl::flat_hash_map<string, AttrValue>* body_parameters) {
   if (!body_parameters->empty()) {
     return errors::InvalidArgument("Body parameters output map must be empty");
   }
@@ -514,8 +498,7 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 
   // Function body shares the library with the graph that instantiated it. We do
   // not need a full copy of the function library, just the reachable subset.
-  *function_body.mutable_library() =
-      ReachableFunctionLibraryDefinition(flib, func).ToProto();
+  *function_body.mutable_library() = flib.ReachableDefinitions(func).ToProto();
 
   VLOG(3) << absl::Substitute(
       "Deleted $0 unreachable functions from the Grappler function item "
@@ -525,12 +508,18 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 
   // TODO(ezhulenev): support functions with tensor sequence inputs/outputs
 
-  // Make sure that there is no tensor sequences in outputs
+  // Make sure that there are no tensor lists in inputs or outputs.
+  for (const OpDef::ArgDef& input : signature.input_arg()) {
+    if (!input.type_list_attr().empty() || !input.number_attr().empty()) {
+      return errors::InvalidArgument(
+          "Inputs with lists of tensors are not supported. Input: ",
+          input.name());
+    }
+  }
   for (const OpDef::ArgDef& output : signature.output_arg()) {
     if (!output.type_list_attr().empty() || !output.number_attr().empty()) {
       return errors::InvalidArgument(
-          "Outputs with sequence of tensors are not supported. Unsupported "
-          "output: ",
+          "Outputs with lists of tensors are not supported. Output: ",
           output.name());
     }
   }
@@ -540,13 +529,6 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 
   // For each input argument create a placeholder in function body.
   for (const OpDef::ArgDef& input : signature.input_arg()) {
-    if (!input.type_list_attr().empty() || !input.number_attr().empty()) {
-      return errors::InvalidArgument(
-          "Inputs with sequence of tensors are not supported. Unsupported "
-          "input: ",
-          input.name());
-    }
-
     DataType input_data_type;
     TF_RETURN_IF_ERROR(instantiation.GetArgType(input, &input_data_type));
 
@@ -565,8 +547,25 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
     inputs.push_back(std::move(input_expansion));
   }
 
-  // Add all function nodes to the function body
+  // Keep names of all nodes in the function body to guarantee that we do not
+  // add an identity with a duplicate name.
+  absl::flat_hash_set<absl::string_view> func_body_nodes;
+
+  // Generate unique output node name: "${out_arg_name}_output_node_${index}".
+  const auto output_node_name = [&func_body_nodes](const OpDef::ArgDef& out,
+                                                   int index) -> string {
+    string name = absl::StrCat(out.name(), "_output_node_", index);
+    int i = 1;
+    while (func_body_nodes.find(name) != func_body_nodes.end()) {
+      name = absl::StrCat(out.name(), "_output_node_", index, "_", i++);
+    }
+    return name;
+  };
+
+  // Add all function nodes to the function body.
   for (const NodeDef& func_def_node : func.node_def()) {
+    func_body_nodes.insert(func_def_node.name());
+
     NodeDef* new_node = function_body.add_node();
     *new_node = func_def_node;
 
@@ -589,8 +588,13 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 
   std::vector<OutputArgExpansion> outputs;
   outputs.reserve(signature.output_arg_size());
-  // Add function outputs
+
+  // For each function output argument we create an Identity node in the
+  // function body, that reads output tensor from the function body node.
   for (const OpDef::ArgDef& out : signature.output_arg()) {
+    DataType output_data_type;
+    TF_RETURN_IF_ERROR(instantiation.GetArgType(out, &output_data_type));
+
     std::vector<string> output_tensors;
     auto ret = func.ret().find(out.name());
     TF_RETURN_IF_ERROR(
@@ -600,13 +604,23 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
             // Otherwise output must be one of the function inputs
             : connectivity.ExpandFunctionDefInput(out.name(), &output_tensors));
 
-    DataType output_data_type;
-    TF_RETURN_IF_ERROR(instantiation.GetArgType(out, &output_data_type));
+    absl::InlinedVector<string, 1> output_nodes;
+    for (int i = 0; i < output_tensors.size(); ++i) {
+      const string& output_tensor = output_tensors[i];
+
+      NodeDef* identity = function_body.add_node();
+      identity->set_name(output_node_name(out, i));
+      identity->set_op("Identity");
+      (*identity->mutable_attr())["T"].set_type(output_data_type);
+      identity->add_input(output_tensor);
+
+      output_nodes.push_back(identity->name());
+    }
 
     OutputArgExpansion output{/*output_name=*/out.name(),
                               /*data_type=*/output_data_type,
                               /*is_ref=*/out.is_ref(),
-                              /*output_tensors=*/std::move(output_tensors)};
+                              /*output_nodes=*/std::move(output_nodes)};
     outputs.push_back(std::move(output));
   }
 
@@ -645,7 +659,7 @@ Status RegisterGrapplerFunctionConnectivity(
   return Status::OK();
 }
 
-Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
+Status ReplaceInputWithConst(const NodeDef& input_const, int input_index,
                              GrapplerFunctionItem* item) {
   if (!IsConstant(input_const)) {
     return errors::InvalidArgument("Input node ", input_const.name(),
@@ -657,7 +671,7 @@ Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
   // Find input arg expansion and input placeholder position in it for the
   // given function input position.
   InputArgExpansion* input_arg_expansion = nullptr;
-  int placeholder_idx = input_position;
+  int placeholder_idx = input_index;
 
   for (InputArgExpansion& input : inputs) {
     if (placeholder_idx < input.placeholders.size()) {
@@ -668,14 +682,12 @@ Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
   }
 
   if (input_arg_expansion == nullptr) {
-    return errors::InvalidArgument(
-        "Input placeholder not found: input_position=", input_position,
-        " function=", item->id);
+    return errors::InvalidArgument("Input placeholder not found: input_index=",
+                                   input_index, " function=", item->id);
   }
 
   // Delete placeholder from input expansion.
   string placeholder_name = input_arg_expansion->placeholders[placeholder_idx];
-  item->input_arg_placeholders_.erase(placeholder_name);
   input_arg_expansion->placeholders.erase(
       input_arg_expansion->placeholders.begin() + placeholder_idx);
 
@@ -699,43 +711,46 @@ Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
   return Status::OK();
 }
 
-Status RemoveUnusedOutputs(const gtl::FlatSet<int>& active_outputs,
-                           GrapplerFunctionItem* item,
-                           std::vector<std::pair<int, int>>* output_mapping) {
+Status RemoveFunctionOutputs(const absl::flat_hash_set<int>& remove_outputs,
+                             GrapplerFunctionItem* item,
+                             std::vector<std::pair<int, int>>* output_mapping) {
   DCHECK(output_mapping->empty());
 
-  // Do some sanity checking of the active outputs positions.
-  for (int active_output : active_outputs) {
-    if (active_output < 0 || active_output >= item->output_size()) {
+  // Code below assumes that we do not support tensor list outputs and there is
+  // a 1-to-1 mapping between output tensor and output argument expansion.
+  for (const OutputArgExpansion& out_arg : item->outputs()) {
+    DCHECK(out_arg.output_nodes.size() == 1)
+        << "Output arg expansion must have single output";
+  }
+
+  // Do some sanity checking of the removed outputs positions.
+  for (int remove_output : remove_outputs) {
+    if (remove_output < 0 || remove_output >= item->output_size()) {
       return errors::InvalidArgument(
-          "Active output position is out of bound: active_output=",
-          active_output, " num_output_args=", item->output_size());
+          "Function output index is out of bound: index=", remove_output,
+          " max_output_index=", item->output_size());
     }
   }
 
-  gtl::FlatSet<const OutputArgExpansion*> unused_output_args;
-
-  const auto is_unused_output_arg = [&](const OutputArgExpansion& output) {
-    return unused_output_args.find(&output) != unused_output_args.end();
+  absl::flat_hash_set<const OutputArgExpansion*> remove_output_args;
+  const auto is_remove_output_arg = [&](const OutputArgExpansion& output) {
+    return remove_output_args.find(&output) != remove_output_args.end();
   };
 
   for (int i = 0; i < item->output_size(); ++i) {
     const OutputArgExpansion& output = item->output(i);
-    DCHECK(output.output_tensors.size() == 1)
-        << "Output arg expansion must have single tensor";
-
-    if (active_outputs.find(i) == active_outputs.end()) {
-      VLOG(3) << "Remove unused output: output_name=" << output.output_name
-              << " output_position=" << i;
-      unused_output_args.insert(&output);
-    } else if (!unused_output_args.empty()) {
+    if (remove_outputs.find(i) != remove_outputs.end()) {
+      VLOG(3) << "Remove functions output: output_name=" << output.output_name
+              << "(index = " << i << ")";
+      remove_output_args.insert(&output);
+    } else if (!remove_output_args.empty()) {
       // Add output mapping only if output position changed.
-      output_mapping->push_back({i, i - unused_output_args.size()});
+      output_mapping->push_back({i, i - remove_output_args.size()});
     }
   }
 
   auto& o = item->output_arg_expansions_;
-  o.erase(std::remove_if(o.begin(), o.end(), is_unused_output_arg), o.end());
+  o.erase(std::remove_if(o.begin(), o.end(), is_remove_output_arg), o.end());
 
   return Status::OK();
 }
@@ -747,6 +762,55 @@ Status MakeFunctionDef(const GrapplerFunctionItem& item,
   func->mutable_signature()->set_description(item.description());
   func->mutable_signature()->set_is_stateful(item.is_stateful());
 
+  // Keep track of placeholders that were added to the graph in place of
+  // expanded function input arguments.
+  absl::flat_hash_set<absl::string_view> input_placeholders;
+  for (const InputArgExpansion& input_arg : item.inputs()) {
+    for (const string& placeholder : input_arg.placeholders) {
+      input_placeholders.insert(placeholder);
+    }
+  }
+
+  // Keep track of identity nodes that were added to the graph in place of
+  // expanded function output arguments.
+  absl::flat_hash_set<absl::string_view> output_nodes;
+  for (const OutputArgExpansion& output_arg : item.outputs()) {
+    for (const string& output_node : output_arg.output_nodes) {
+      output_nodes.insert(output_node);
+    }
+  }
+
+  // If the output identity node was not modified by any optimizer, we can
+  // bypass it and returns the function value from its input.
+  absl::flat_hash_map<absl::string_view, string> output_tensors;
+  for (const NodeDef& func_body_node : item.function_body().node()) {
+    if (!IsIdentity(func_body_node)) continue;
+
+    const string& node_name = func_body_node.name();
+    if (output_nodes.find(node_name) != output_nodes.end()) {
+      // Grappler optimizers might optimize nodes in the fanin of the output
+      // node, and forward their control dependencies. We can't express control
+      // dependencies in a function signature, so we have to keep the node.
+      if (func_body_node.input_size() == 1) {
+        VLOG(3) << "Bypass function output node: " << node_name << " -> "
+                << func_body_node.input(0);
+        output_tensors.emplace(node_name, func_body_node.input(0));
+      } else {
+        VLOG(3) << "Keep function output node: " << node_name;
+      }
+    }
+  }
+
+  // Return output tensor name (input of the output node) if it's safe to bypass
+  // output node, otherwise returns the output node name.
+  const auto output_tensor =
+      [&output_tensors](const OutputArgExpansion& output_arg) -> const string& {
+    const string& output_node = output_arg.output_nodes[0];
+    const auto is_output_tensor = output_tensors.find(output_node);
+    return is_output_tensor == output_tensors.end() ? output_node
+                                                    : is_output_tensor->second;
+  };
+
   // Build a GrapplerFunctionConnectivity from inputs and new function body.
   GrapplerFunctionConnectivity connectivity;
   TF_RETURN_IF_ERROR(
@@ -754,8 +818,8 @@ Status MakeFunctionDef(const GrapplerFunctionItem& item,
 
   // Add function input arguments.
   for (const InputArgExpansion& input_arg : item.inputs()) {
-    CHECK(input_arg.placeholders.size() == 1)  // do some sanity checking
-        << "Inputs of tensor sequences are not supported";
+    DCHECK(input_arg.placeholders.size() == 1)  // do some sanity checking
+        << "Inputs of tensor lists are not supported";
 
     OpDef::ArgDef arg_def;
     arg_def.set_name(input_arg.input_name);
@@ -766,8 +830,8 @@ Status MakeFunctionDef(const GrapplerFunctionItem& item,
 
   // Add function output arguments.
   for (const OutputArgExpansion& output_arg : item.outputs()) {
-    CHECK(output_arg.output_tensors.size() == 1)  // do some sanity checking
-        << "Outputs of tensor sequences are not supported";
+    DCHECK(output_arg.output_nodes.size() == 1)  // do some sanity checking
+        << "Outputs of tensor lists are not supported";
 
     OpDef::ArgDef arg_def;
     arg_def.set_name(output_arg.output_name);
@@ -775,11 +839,9 @@ Status MakeFunctionDef(const GrapplerFunctionItem& item,
     arg_def.set_is_ref(output_arg.is_ref);
     *func->mutable_signature()->add_output_arg() = arg_def;
 
-    string ret;
-    for (const string& output_tensor : output_arg.output_tensors) {
-      TF_RETURN_IF_ERROR(connectivity.AsFunctionDefInput(output_tensor, &ret));
-      (*func->mutable_ret())[output_arg.output_name] = ret;
-    }
+    TF_RETURN_IF_ERROR(connectivity.AsFunctionDefInput(
+        output_tensor(output_arg),
+        &(*func->mutable_ret())[output_arg.output_name]));
   }
 
   // Copy function definition specific attributes.
@@ -790,12 +852,16 @@ Status MakeFunctionDef(const GrapplerFunctionItem& item,
   }
 
   // Copy function body nodes to the FunctionDef and update input format
-  for (const NodeDef& func_body_node : item.function_body().node()) {
-    // Do not copy input placeholders
-    if (item.IsInputPlaceholder(func_body_node.name())) continue;
+  for (const NodeDef& func_node : item.function_body().node()) {
+    const string& name = func_node.name();
+
+    // Do not copy input placeholders.
+    if (IsPlaceholder(func_node) && input_placeholders.count(name)) continue;
+    // Do not copy output nodes that we bypassed.
+    if (IsIdentity(func_node) && output_tensors.count(name)) continue;
 
     NodeDef* func_def_node = func->add_node_def();
-    *func_def_node = func_body_node;
+    *func_def_node = func_node;
     TF_RETURN_IF_ERROR(connectivity.AsFunctionDefNode(func_def_node));
   }
 
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index 038cf5f527e0f32cc10e123bb0cab357e5902463..d5a41e74739d67fc2cef0c295efe208edbd6255c 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -18,7 +18,10 @@ limitations under the License.
 
 #include <memory>
 #include <string>
-#include <unordered_map>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/container/inlined_vector.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function.pb.h"
@@ -30,12 +33,20 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-// Returns a copy of FunctionLibraryDefinition with subset of functions that are
-// reachable from the nodes of the graph.
-FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
-    const FunctionLibraryDefinition& flib, const GraphDef& graph);
-FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
-    const FunctionLibraryDefinition& flib, const FunctionDef& func);
+// WARNING(ezhulenev): Currently we do not support functions with inputs or
+// outputs instantiated into multiple tensors. This can happen if the
+// input/output type is 'T*N' or 'list(type)'. This is enforced by multiple
+// checks across this file and also function_optimizer.cc. InputArgExpansion and
+// OutputArgExpansion already support lists of tensors, but that's pretty much
+// it, all other code is written with assumption that expansions are always of
+// size 1. MakeGrapplerFunctionItem will gracefully fail with Status error.
+//
+// This is a low priority feature, because in practice we don't see a lot (any
+// at all?) functions with such arguments. Tensorflow-Eager always produces
+// functions with plain input/output arguments.
+
+// TODO(ezhulenev): Support inputs and outputs of type 'T*N'.
+// TODO(ezhulenev): Support inputs and outputs of type 'list(type)'.
 
 // Depending on the function instantiation attributes, input argument to the
 // function might be a single tensor, list of tensors of the same type, or a
@@ -44,30 +55,23 @@ FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
 // InputArgExpansion keeps track of the placeholders that were added to the
 // function body in place of function inputs and a resolved input data type.
 struct InputArgExpansion {
-  // TODO(ezhulenev): Add support for functions with tensor sequence inputs of
-  // different data types.
-  // TODO(ezhulenev): Support type parametrized inputs?
-  string input_name;                 // name of the function input argument
-  DataType data_type;                // input data type
-  bool is_ref;                       // if true, inputs are required to be refs
-  std::vector<string> placeholders;  // names of placeholder nodes in the
-                                     // function body
+  string input_name;
+  DataType data_type;
+  bool is_ref;
+  absl::InlinedVector<string, 1> placeholders;
 };
 
 // Depending on the function instantiation attributes, output argument is mapped
 // to one or more outputs of one of the function body nodes.
 //
-// OutputArgExpansion keeps mapping from a function output arg to the output
-// tensors of a function body nodes and a resolved output data type
+// OutputArgExpansion keeps track of the Identity nodes that were added to the
+// function body to forward output tensors. Adding these output nodes allows
+// nested function inlining and specialization (see function optimizer).
 struct OutputArgExpansion {
-  // TODO(ezhulenev): Add support for functions with tensor sequence outputs of
-  // different data types.
-  // TODO(ezhulenev): Support type parametrized outputs?
-  string output_name;                  // name of the function output argument
-  DataType data_type;                  // output data type
-  bool is_ref;                         // if true, outputs are refs
-  std::vector<string> output_tensors;  // names of output tensor from the
-                                       // function body nodes
+  string output_name;
+  DataType data_type;
+  bool is_ref;
+  absl::InlinedVector<string, 1> output_nodes;
 };
 
 // FunctionDef uses different connectivity encoding for the function body nodes,
@@ -81,44 +85,46 @@ class GrapplerFunctionConnectivity {
   void RegisterFunctionBodyOutputs(const string& node_name,
                                    tensorflow::NameRangeMap&& outputs);
 
-  // Expand input encoded in FunctionDef format (name[:output][:position]) into
+  // Expands input encoded in FunctionDef format (name[:output][:position]) into
   // multiple inputs in GraphDef format (name[:position]).
   Status ExpandFunctionDefInput(const string& func_def_input,
                                 std::vector<string>* graph_def_inputs) const;
 
-  // Update Node inputs from FunctionDef to GraphDef format.
+  // Updates Node inputs from FunctionDef to GraphDef format.
   Status ExpandNodeInputs(NodeDef* function_body_node) const;
 
   // When expanding inputs in function def format, single input might be
   // expanded into multiple tensors. When converting back to the function def
   // format from graph def format, it's always a 1-to-1 relationship.
-  // FunctionDef built from GrapplerFunctionItem is always specialized to it's
+  // FunctionDef built from GrapplerFunctionItem is always specialized to its
   // instantiation attributes and length of input args (and node def outputs) is
   // known.
 
-  // Map from GraphDef input format to FunctionDef input format using registered
-  // input arg expansion and function body outputs.
+  // Converts input name from GraphDef format (name[:position]) to the
+  // FunctionDef input format (name[:output][:position]) using registered input
+  // arg expansion and function body outputs.
   Status AsFunctionDefInput(const string& graph_def_input,
                             string* func_def_input) const;
 
-  // Update Node inputs from GraphDef to FunctionDef format.
+  // Updates Node inputs from GraphDef to FunctionDef format.
   Status AsFunctionDefNode(NodeDef* function_body_node) const;
 
  private:
   // Mapping from input name to input arg expansion.
-  std::unordered_map<string, InputArgExpansion> input_arg_expansions_;
+  absl::flat_hash_map<string, InputArgExpansion> input_arg_expansions_;
   // Mapping from function body node name to output names range map.
-  std::unordered_map<string, tensorflow::NameRangeMap> function_body_outputs_;
+  absl::flat_hash_map<string, tensorflow::NameRangeMap> function_body_outputs_;
 
+  // For each placeholder added to the function instantiation graph, we keep a
+  // mapping back to the function input argument name and index.
   struct InputArgPlaceholder {
-    string input_name;   // Name of the function input argument.
-    int input_position;  // Index of a tensor in the function input argument
-                         // expansion, it can be greater than `0` if input
-                         // argument is a list of tensors (aka list(type)).
+    string input_name;  // Name of the function input argument.
+    int input_index;    // Index of a tensor in the function input argument
+                        // expansion, it can be greater than `0` if input
+                        // argument is a list of tensors (aka list(type)).
   };
-
   // Mapping from input arg placeholder to the function input tensor.
-  std::unordered_map<string, InputArgPlaceholder> input_arg_placeholders_;
+  absl::flat_hash_map<string, InputArgPlaceholder> input_arg_placeholders_;
 };
 
 // Get Function type attributes using attributes of a node that instantiated
@@ -147,8 +153,6 @@ class GrapplerFunctionItem : public GrapplerItem {
 
   const string& description() const;
 
-  bool IsInputPlaceholder(const string& node_name) const;
-
   const std::vector<InputArgExpansion>& inputs() const;
   const InputArgExpansion& input(int i) const;
   const std::size_t input_size() const;
@@ -171,9 +175,9 @@ class GrapplerFunctionItem : public GrapplerItem {
                                          GrapplerFunctionItem*);
   friend Status ReplaceInputWithConst(const NodeDef&, int,
                                       GrapplerFunctionItem*);
-  friend Status RemoveUnusedOutputs(
-      const gtl::FlatSet<int>& active_outputs, GrapplerFunctionItem* item,
-      std::vector<std::pair<int, int>>* output_mapping);
+  friend Status RemoveFunctionOutputs(const absl::flat_hash_set<int>&,
+                                      GrapplerFunctionItem*,
+                                      std::vector<std::pair<int, int>>*);
 
   GrapplerFunctionItem(string func_name, string description,
                        AttrSlice func_attr,
@@ -189,16 +193,14 @@ class GrapplerFunctionItem : public GrapplerItem {
   std::vector<InputArgExpansion> input_arg_expansions_;
   std::vector<OutputArgExpansion> output_arg_expansions_;
 
-  std::set<string> input_arg_placeholders_;
-
-  bool is_stateful_;
+  bool is_stateful_ = false;
 };
 
 // Check if function input/output types are fully defined only at instantiation
-// time (parametrized by it's instantiation node).
+// time (parametrized by its instantiation node).
 bool HasParametrizedType(const FunctionDef& func);
 
-// Check if a function body is parametrized by it's instantiation node. Function
+// Check if a function body is parametrized by its instantiation node. Function
 // body is parametrized, if it has at least one node with a 'placeholder'
 // attribute.
 bool HasParametrizedBody(const FunctionDef& func);
@@ -210,14 +212,14 @@ bool IsParametrized(const FunctionDef& func);
 // caller node. Return error if type can't be resolved.
 Status InstantiationTypeParameters(
     const FunctionDef& func, const AttrSlice& func_instantiation_attr,
-    std::unordered_map<string, DataType>* type_parameters);
+    absl::flat_hash_map<string, DataType>* type_parameters);
 
 // Resolve function instantiation body parameters (values for the function body
 // attr placeholders) from the attributes of the caller node. Return error if
 // type can't be resolved.
 Status InstantiationBodyParameters(
     const FunctionDef& func, const AttrSlice& func_instantiation_attr,
-    std::unordered_map<string, AttrValue>* body_parameters);
+    absl::flat_hash_map<string, AttrValue>* body_parameters);
 
 // Register GrapplerFunctionItem input arg expansion and function body outputs
 // in the GrapplerFunctionConnectivity. Use function library definition to
@@ -227,18 +229,19 @@ Status RegisterGrapplerFunctionConnectivity(
     GrapplerFunctionConnectivity* connectivity);
 
 // Replace one of the function inputs with a constant.
-Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
+Status ReplaceInputWithConst(const NodeDef& input_const, int input_index,
                              GrapplerFunctionItem* item);
 
-// Remove function output arguments that do not have any active outputs (output
-// tensor connected to other node inputs or in a fetch set). Active outputs uses
-// GraphDef output position encoding, and multiple active outputs could
-// potentially be connected to the same output argument (in case of tensor list
-// outputs). Add output mapping for all active outputs that changed it's output
-// position (std::pair<old position, new position>).
-Status RemoveUnusedOutputs(const gtl::FlatSet<int>& active_outputs,
-                           GrapplerFunctionItem* item,
-                           std::vector<std::pair<int, int>>* output_mapping);
+// Removes outputs from instantiated grappler function item. Function node
+// outputs use GraphDef output index encoding, and multiple outputs might belong
+// to the same output argument expansion (in case of tensor list outputs). For
+// all active function outputs that changed its output index, this function adds
+// an output mapping (std::pair<old index, new index>).
+Status RemoveFunctionOutputs(const absl::flat_hash_set<int>& remove_outputs,
+                             GrapplerFunctionItem* item,
+                             std::vector<std::pair<int, int>>* output_mapping);
+
+// TODO(ezhulennev, b/120103818): Add RemoveFunctionInputs.
 
 // Make a GrapplerFunctionItem from the function definition and function
 // instantiation attributes (caller node attributes). Returns error if the given
@@ -253,7 +256,7 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 // fully defined (no type or body parametrization).
 // TODO(ezhulenev): Support parametrized functions without fully defined
 // instantiation attributes? Do we ever want to optimize parametrized function
-// without specializing it to it's instantiation attributes (at least types)?
+// without specializing it to its instantiation attributes (at least types)?
 Status MakeGrapplerFunctionItem(const FunctionDef& func,
                                 const FunctionLibraryDefinition& flib,
                                 int graph_def_version,
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index 5923850eca65a219fe3c452947751509a2bcf445..c49920c79cca93c8b592ce744926f06251a499ad 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/utils/functions.h"
+
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -77,7 +79,7 @@ TEST_F(FunctionsTest, InstantiationParameters) {
   func_instantiation_attr["B"].set_type(DT_INT32);
   func_instantiation_attr["C"].set_type(DT_DOUBLE);
 
-  std::unordered_map<string, DataType> type_parameters;
+  absl::flat_hash_map<string, DataType> type_parameters;
   TF_EXPECT_OK(InstantiationTypeParameters(
       func, AttrSlice(&func_instantiation_attr), &type_parameters));
 
@@ -86,7 +88,7 @@ TEST_F(FunctionsTest, InstantiationParameters) {
   EXPECT_EQ(DT_INT32, type_parameters["B"]);
   EXPECT_EQ(DT_DOUBLE, type_parameters["C"]);
 
-  std::unordered_map<string, AttrValue> body_parameters;
+  absl::flat_hash_map<string, AttrValue> body_parameters;
   TF_EXPECT_OK(InstantiationBodyParameters(
       func, AttrSlice(&func_instantiation_attr), &body_parameters));
 
@@ -247,15 +249,16 @@ TEST_F(FunctionsTest, FromSimpleFunctionDef) {
                                         flib, TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ("XTimesTwo", item.id);
-  EXPECT_EQ(4, item.function_body().node_size());
+  EXPECT_EQ(5, item.function_body().node_size());
 
   EXPECT_EQ(1, item.input_size());
   EXPECT_EQ("x", item.input(0).input_name);
-  EXPECT_EQ(std::vector<string>{"x"}, item.input(0).placeholders);
+  ASSERT_EQ(1, item.input(0).placeholders.size());
+  EXPECT_EQ("x", item.input(0).placeholders[0]);
 
   EXPECT_EQ(1, item.output_size());
   EXPECT_EQ("y", item.output(0).output_name);
-  EXPECT_EQ("y", item.output(0).output_tensors[0]);
+  EXPECT_EQ("y_output_node_0", item.output(0).output_nodes[0]);
 
   int count = 0;
   for (const NodeDef &node : item.function_body().node()) {
@@ -277,9 +280,13 @@ TEST_F(FunctionsTest, FromSimpleFunctionDef) {
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("x", node.input(0));
       EXPECT_EQ("scale", node.input(1));
+    } else if (node.name() == "y_output_node_0" && ++count) {
+      EXPECT_EQ("Identity", node.op());
+      ASSERT_EQ(1, node.input_size());
+      EXPECT_EQ("y", node.input(0));
     }
   }
-  EXPECT_EQ(4, count);
+  EXPECT_EQ(5, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
@@ -324,7 +331,7 @@ TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
                                         flib, TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ("SubGrad", item.id);
-  EXPECT_EQ(12, item.function_body().node_size());
+  EXPECT_EQ(14, item.function_body().node_size());
 
   ASSERT_EQ(3, item.input_size());
   EXPECT_EQ("x", item.input(0).input_name);
@@ -332,8 +339,8 @@ TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
   EXPECT_EQ("dz", item.input(2).input_name);
 
   ASSERT_EQ(2, item.output_size());
-  EXPECT_EQ("dx", item.output(0).output_tensors[0]);
-  EXPECT_EQ("dy", item.output(1).output_tensors[0]);
+  EXPECT_EQ("dx_output_node_0", item.output(0).output_nodes[0]);
+  EXPECT_EQ("dy_output_node_0", item.output(1).output_nodes[0]);
 
   int count = 0;
   for (const NodeDef &node : item.function_body().node()) {
@@ -357,9 +364,17 @@ TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("gy", node.input(0));
       EXPECT_EQ("rx:1", node.input(1));
+    } else if (node.name() == "dx_output_node_0" && ++count) {
+      EXPECT_EQ("Identity", node.op());
+      ASSERT_EQ(1, node.input_size());
+      EXPECT_EQ("dx", node.input(0));
+    } else if (node.name() == "dy_output_node_0" && ++count) {
+      EXPECT_EQ("Identity", node.op());
+      ASSERT_EQ(1, node.input_size());
+      EXPECT_EQ("dy", node.input(0));
     }
   }
-  EXPECT_EQ(6, count);
+  EXPECT_EQ(8, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithNestedFuncs) {
@@ -470,7 +485,7 @@ TEST_F(FunctionsTest, FromFunctionDefWithOutputMappings) {
                                         flib, TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ(1, item.output_size());
-  EXPECT_EQ("Exp", item.output(0).output_tensors[0]);
+  EXPECT_EQ("out_output_node_0", item.output(0).output_nodes[0]);
 
   int count = 0;
   for (const NodeDef &node : item.function_body().node()) {
@@ -486,9 +501,13 @@ TEST_F(FunctionsTest, FromFunctionDefWithOutputMappings) {
       EXPECT_EQ("Exp", node.op());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("Linear_func", node.input(0));
+    } else if (node.name() == "out_output_node_0" && ++count) {
+      EXPECT_EQ("Identity", node.op());
+      ASSERT_EQ(1, node.input_size());
+      EXPECT_EQ("Exp", node.input(0));
     }
   }
-  EXPECT_EQ(3, count);
+  EXPECT_EQ(4, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithInputForwarding) {
@@ -515,27 +534,44 @@ TEST_F(FunctionsTest, FromFunctionDefWithInputForwarding) {
                                         flib, TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ("ForwardInputs", item.id);
-  EXPECT_EQ(5, item.function_body().node_size());
+  EXPECT_EQ(8, item.function_body().node_size());
 
   EXPECT_EQ(3, item.output_size());
-  EXPECT_EQ("in0", item.output(0).output_tensors[0]);
-  EXPECT_EQ("arg2", item.output(1).output_tensors[0]);
-  EXPECT_EQ("arg3", item.output(2).output_tensors[0]);
+  EXPECT_EQ("out0_output_node_0", item.output(0).output_nodes[0]);
+  EXPECT_EQ("arg2_output_node_0", item.output(1).output_nodes[0]);
+  EXPECT_EQ("arg3_output_node_0", item.output(2).output_nodes[0]);
 
   int count = 0;
+
+  const auto is_arg_placeholder = [](const string &name) {
+    return name == "in0" || name == "in1" || name == "arg2" || name == "arg3" ||
+           name == "arg4";
+  };
+
   for (const NodeDef &node : item.function_body().node()) {
-    EXPECT_TRUE(node.name() == "in0" || node.name() == "in1" ||
-                node.name() == "arg2" || node.name() == "arg3" ||
-                node.name() == "arg4");
-    count++;
-    EXPECT_EQ("Placeholder", node.op());
-    if (node.name() == "arg3") {
-      EXPECT_EQ(DT_INT32, node.attr().at("dtype").type());
-    } else {
-      EXPECT_EQ(DT_FLOAT, node.attr().at("dtype").type());
+    if (is_arg_placeholder(node.name()) && node.op() == "Placeholder") {
+      count++;
+      if (node.name() == "arg3") {
+        EXPECT_EQ(DT_INT32, node.attr().at("dtype").type());
+      } else {
+        EXPECT_EQ(DT_FLOAT, node.attr().at("dtype").type());
+      }
+      continue;
+    }
+
+    EXPECT_EQ("Identity", node.op());
+    ASSERT_EQ(1, node.input_size());
+    EXPECT_TRUE(is_arg_placeholder(node.input(0)));
+
+    if (node.name() == "out0_output_node_0" && ++count) {
+      EXPECT_EQ("in0", node.input(0));
+    } else if (node.name() == "arg2_output_node_0" && ++count) {
+      EXPECT_EQ("arg2", node.input(0));
+    } else if (node.name() == "arg3_output_node_0" && ++count) {
+      EXPECT_EQ("arg3", node.input(0));
     }
   }
-  EXPECT_EQ(5, count);
+  EXPECT_EQ(8, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
@@ -564,16 +600,22 @@ TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
 
   EXPECT_EQ(0, item.input_size());
   EXPECT_EQ(1, item.output_size());
-  EXPECT_EQ("o", item.output(0).output_tensors[0]);
+  EXPECT_EQ("o_output_node_0", item.output(0).output_nodes[0]);
+  EXPECT_EQ(3, item.function_body().node_size());
 
-  EXPECT_EQ(2, item.function_body().node_size());
   const NodeDef &two = item.function_body().node(0);
   EXPECT_EQ("two", two.name());
   EXPECT_EQ(0, two.input_size());
+
   const NodeDef &cast = item.function_body().node(1);
   EXPECT_EQ("o", cast.name());
   EXPECT_EQ(1, cast.input_size());
   EXPECT_EQ("two", cast.input(0));
+
+  const NodeDef &retval = item.function_body().node(2);
+  EXPECT_EQ("o_output_node_0", retval.name());
+  EXPECT_EQ(1, retval.input_size());
+  EXPECT_EQ("o", retval.input(0));
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithSideEffectfulOps) {
@@ -672,7 +714,7 @@ TEST_F(FunctionsTest, ReplaceInputWithConst) {
   EXPECT_EQ(2, item.input_size());
   EXPECT_EQ(1, item.output_size());
 
-  ASSERT_EQ(3, item.function_body().node_size());
+  ASSERT_EQ(4, item.function_body().node_size());
 
   const NodeDef &input_x = item.function_body().node(0);
   const NodeDef &input_y = item.function_body().node(1);
@@ -746,8 +788,9 @@ TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) {
       {{"z", "output:z:0"}});
 
   GraphDef id_func_body = test::function::GDef(
-      {/* pass input to output through identity */
-       NDef("output", "Identity", {"x"}, {{"T", "float"}})});
+      {/* Read and return input argument through Identity node. */
+       NDef("read_x", "Identity", {"x"}, {{"T", "float"}}),
+       NDef("z_output_node_0", "Identity", {"read_x"}, {{"T", "float"}})});
 
   protobuf::Map<string, AttrValue> func_instantiation_attr;
   func_instantiation_attr["T"].set_type(DT_FLOAT);
@@ -770,15 +813,15 @@ TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) {
   // Check that graph body was updated.
   int count = 0;
   for (const NodeDef &node : specialized.node_def()) {
-    if (node.name() == "output" && ++count) {
+    if (node.name() == "read_x" && ++count) {
       EXPECT_EQ("Identity", node.op());
       EXPECT_EQ("x:0", node.input(0));
     }
   }
   EXPECT_EQ(1, count);
 
-  // And return tensor mapping was updated with a new output name (z->output).
-  EXPECT_EQ("output:output:0", (*specialized.mutable_ret())["z"]);
+  // And return tensor mapping was updated with a new output name (z->read_x).
+  EXPECT_EQ("read_x:output:0", (*specialized.mutable_ret())["z"]);
 }
 
 TEST_F(FunctionsTest, FunctionDefGrapplerFunctionItemRoundTrip) {
diff --git a/tensorflow/core/grappler/utils/grappler_test.cc b/tensorflow/core/grappler/utils/grappler_test.cc
index 576494cad55e22ba8457f30d0ea79b53f6f5de78..1b4b9f9a51af17c4472f0fc34331b75192e3d3ae 100644
--- a/tensorflow/core/grappler/utils/grappler_test.cc
+++ b/tensorflow/core/grappler/utils/grappler_test.cc
@@ -14,7 +14,11 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/utils/grappler_test.h"
+
 #include <memory>
+
+#include "absl/algorithm/container.h"
+#include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
@@ -23,6 +27,46 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+namespace {
+void CompareGraphNodes(protobuf::RepeatedPtrField<NodeDef>* want,
+                       protobuf::RepeatedPtrField<NodeDef>* got) {
+  auto comparator = [](const NodeDef& n1, const NodeDef& n2) -> bool {
+    return n1.name() < n2.name();
+  };
+
+  std::sort(want->begin(), want->end(), comparator);
+  std::sort(got->begin(), got->end(), comparator);
+
+  ASSERT_EQ(want->size(), got->size());
+
+  for (int i = 0; i < want->size(); ++i) {
+    NodeDef& want_node = (*want)[i];
+    NodeDef& got_node = (*got)[i];
+
+    EXPECT_EQ(want_node.op(), got_node.op());
+    EXPECT_EQ(want_node.name(), got_node.name());
+    EXPECT_EQ(want_node.device(), got_node.device());
+    ASSERT_EQ(want_node.input_size(), got_node.input_size());
+
+    // Order of control dependencies doesn't matter, so we sort them first.
+    const auto is_control = [](const string& input) -> bool {
+      return ParseTensorName(input).index() < 0;
+    };
+
+    auto want_inputs = want_node.mutable_input();
+    auto got_inputs = got_node.mutable_input();
+    std::sort(absl::c_find_if(*want_inputs, is_control), want_inputs->end());
+    std::sort(absl::c_find_if(*got_inputs, is_control), got_inputs->end());
+
+    for (int j = 0; j < want_node.input_size(); ++j) {
+      const TensorId want_tensor = ParseTensorName(want_node.input(j));
+      const TensorId got_tensor = ParseTensorName(got_node.input(j));
+      EXPECT_EQ(want_tensor.ToString(), got_tensor.ToString());
+    }
+  }
+}
+}  // namespace
+
 GrapplerTest::GrapplerTest() {
   // Turn off all the automatic optimizations to ensure that we run the graph
   // exactly as it is given to us. This ensures that we can compare the results
@@ -94,34 +138,35 @@ NodeDef* GrapplerTest::AddNode(
 }
 
 void GrapplerTest::CompareGraphs(GraphDef want, GraphDef got) const {
-  auto comparator = [](const NodeDef& n1, const NodeDef& n2) -> bool {
-    return n1.name() < n2.name();
-  };
-  std::sort(want.mutable_node()->begin(), want.mutable_node()->end(),
-            comparator);
-  std::sort(got.mutable_node()->begin(), got.mutable_node()->end(), comparator);
+  CompareGraphNodes(want.mutable_node(), got.mutable_node());
+}
 
-  for (int i = 0; i < want.node_size(); ++i) {
-    std::sort(want.mutable_node(i)->mutable_input()->begin(),
-              want.mutable_node(i)->mutable_input()->end());
-  }
-  for (int i = 0; i < got.node_size(); ++i) {
-    std::sort(got.mutable_node(i)->mutable_input()->begin(),
-              got.mutable_node(i)->mutable_input()->end());
-  }
+void GrapplerTest::CompareFunctions(FunctionDef want, FunctionDef got) const {
+  CompareGraphNodes(want.mutable_node_def(), got.mutable_node_def());
+}
 
-  ASSERT_EQ(want.node_size(), got.node_size());
-  for (int i = 0; i < want.node_size(); ++i) {
-    EXPECT_EQ(want.node(i).op(), got.node(i).op());
-    EXPECT_EQ(want.node(i).name(), got.node(i).name());
-    EXPECT_EQ(want.node(i).device(), got.node(i).device());
+void GrapplerTest::CompareNodes(const NodeDef& want, const NodeDef& got) const {
+  EXPECT_EQ(want.name(), got.name());
+  EXPECT_EQ(want.op(), got.op());
 
-    ASSERT_EQ(want.node(i).input_size(), got.node(i).input_size());
-    for (int j = 0; j < want.node(i).input_size(); ++j) {
-      const TensorId want_tensor = ParseTensorName(want.node(i).input(j));
-      const TensorId got_tensor = ParseTensorName(got.node(i).input(j));
-      EXPECT_EQ(want_tensor.ToString(), got_tensor.ToString());
-    }
+  std::vector<string> want_inputs(want.input().begin(), want.input().end());
+  std::vector<string> got_inputs(got.input().begin(), got.input().end());
+  EXPECT_EQ(want_inputs, got_inputs);
+
+  const auto attr_name = [](const std::pair<const string, AttrValue>& attr) {
+    return attr.first;
+  };
+
+  std::vector<string> want_attrs;
+  std::vector<string> got_attrs;
+  absl::c_transform(want.attr(), std::back_inserter(want_attrs), attr_name);
+  absl::c_transform(got.attr(), std::back_inserter(got_attrs), attr_name);
+  absl::c_sort(want_attrs);
+  absl::c_sort(got_attrs);
+  EXPECT_EQ(want_attrs, got_attrs);
+
+  for (const string& attr : want_attrs) {
+    EXPECT_TRUE(AreAttrValuesEqual(want.attr().at(attr), got.attr().at(attr)));
   }
 }
 
diff --git a/tensorflow/core/grappler/utils/grappler_test.h b/tensorflow/core/grappler/utils/grappler_test.h
index 0cfd740dcbe15e0571bc159858c0ed33c2071cb8..26c1db37405a48a7252f388a3e659b8d07c569ae 100644
--- a/tensorflow/core/grappler/utils/grappler_test.h
+++ b/tensorflow/core/grappler/utils/grappler_test.h
@@ -49,13 +49,32 @@ class GrapplerTest : public ::testing::Test {
                    const std::vector<std::pair<string, AttrValue>>& attributes,
                    GraphDef* graph) const;
 
+  // Checks if two graphs are equal. Both graphs must have the same set of nodes
+  // with the same inputs and attributes. Nodes can be in different order.
+  //
+  // NOTE: This function uses EXPECT/ASSERT macros to check node properties
+  // equality, and adds all failuires to the current test.
   void CompareGraphs(GraphDef want, GraphDef got) const;
 
-  // Check if node 'src' is directly connected to the input($position) of 'dst'.
+  // Checks if two nodes have the same name, op, inputs and attributes.
+  //
+  // NOTE: This function uses EXPECT/ASSERT macros to check node properties
+  // equality, and adds all failuires to the current test.
+  void CompareNodes(const NodeDef& want, const NodeDef& got) const;
+
+  // Checks if two functions are equal. Both functions must have the same set of
+  // nodes with the same inputs and attributes. Nodes can be in different order.
+  //
+  // NOTE: This function uses EXPECT/ASSERT macros to check node properties
+  // equality, and adds all failures to the current test.
+  void CompareFunctions(FunctionDef want, FunctionDef got) const;
+
+  // Checks if node 'src' is directly connected to the input($position) of
+  // 'dst'.
   bool IsNodesDirectlyConnected(const NodeMap& node_map, const string& src,
                                 const string& dst, int position = 0);
 
-  // Count nodes of the given op-type in a graph.
+  // Counts nodes of the given op-type in a graph.
   int CountOpNodes(const GraphDef& graph, const string& op);
 
   // Get a random tensor with given shape.
diff --git a/tensorflow/core/grappler/utils/topological_sort.cc b/tensorflow/core/grappler/utils/topological_sort.cc
index 63ca92c69e1c11a90e7870f1509228d90239fa72..a6d0f5037bb35cbbb909cbb4049153f0d1013c64 100644
--- a/tensorflow/core/grappler/utils/topological_sort.cc
+++ b/tensorflow/core/grappler/utils/topological_sort.cc
@@ -14,10 +14,15 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/utils/topological_sort.h"
+
 #include <algorithm>
 #include <deque>
 #include <unordered_map>
+
+#include "absl/types/span.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/graph_topology_view.h"
+#include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -25,27 +30,46 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+namespace {
+
+std::vector<GraphView::Edge> MakeEphemeralEdges(
+    const absl::Span<const TopologicalDependency> extra_dependencies) {
+  std::vector<GraphView::Edge> ephemeral_edges;
+  ephemeral_edges.reserve(extra_dependencies.size());
+  for (const auto& dep : extra_dependencies) {
+    ephemeral_edges.emplace_back(
+        GraphView::OutputPort(dep.from, Graph::kControlSlot),
+        GraphView::InputPort(dep.to, Graph::kControlSlot));
+  }
+  return ephemeral_edges;
+}
+
 // Kahn's algorithm is implemented.
 // For details, see https://en.wikipedia.org/wiki/Topological_sorting
 Status ComputeTopologicalOrder(
-    const GraphDef& graph, std::vector<int>* ready_nodes,
-    const std::vector<std::pair<const NodeDef*, const NodeDef*>>*
-        extra_dependencies) {
-  SimpleGraphView graph_view;
-  TF_RETURN_IF_ERROR(graph_view.Initialize(graph, extra_dependencies));
+    const GraphDef& graph,
+    const absl::Span<const TopologicalDependency> extra_dependencies,
+    std::vector<int>* ready_nodes) {
+  GraphTopologyView graph_view;
+  TF_RETURN_IF_ERROR(graph_view.InitializeFromGraph(
+      graph, MakeEphemeralEdges(extra_dependencies)));
+
+  // Keep track of how many inputs are ready for the given node.
+  std::vector<int> num_ready_inputs(graph.node_size(), 0);
 
-  ready_nodes->reserve(graph_view.num_nodes());
+  // We'll push index of ready nodes to this output vector.
+  ready_nodes->reserve(graph.node_size());
 
   int front = 0;
   int back = 0;
-  std::vector<int> num_ready_inputs(graph_view.num_nodes(), 0);
-  for (int i = 0; i < graph_view.num_nodes(); i++) {
-    if (graph_view.inputs(i).empty()) {
+
+  for (int i = 0; i < graph.node_size(); i++) {
+    if (graph_view.GetFanin(i).empty()) {
       ready_nodes->push_back(i);
       back++;
     }
     if (IsMerge(graph.node(i))) {
-      for (int input : graph_view.inputs(i)) {
+      for (int input : graph_view.GetFanin(i)) {
         if (IsNextIteration(graph.node(input))) {
           num_ready_inputs[i]++;
         }
@@ -55,9 +79,9 @@ Status ComputeTopologicalOrder(
 
   while (front != back) {
     int ready_node = (*ready_nodes)[front];
-    for (int fanout : graph_view.outputs(ready_node)) {
+    for (int fanout : graph_view.GetFanout(ready_node)) {
       ++num_ready_inputs[fanout];
-      if (num_ready_inputs[fanout] == graph_view.inputs(fanout).size()) {
+      if (num_ready_inputs[fanout] == graph_view.GetFanin(fanout).size()) {
         ready_nodes->push_back(fanout);
         ++back;
       }
@@ -72,23 +96,32 @@ Status ComputeTopologicalOrder(
   return Status::OK();
 }
 
+}  // namespace
+
 Status ComputeTopologicalOrder(
-    const GraphDef& graph, std::unordered_map<const NodeDef*, int>* topo_order,
-    const std::vector<std::pair<const NodeDef*, const NodeDef*>>*
-        extra_dependencies) {
+    const GraphDef& graph,
+    const absl::Span<const TopologicalDependency> extra_dependencies,
+    std::vector<const NodeDef*>* topo_order) {
   std::vector<int> ready_nodes;
   TF_RETURN_IF_ERROR(
-      ComputeTopologicalOrder(graph, &ready_nodes, extra_dependencies));
-  topo_order->reserve(graph.node_size());
-  for (int i = 0; i < ready_nodes.size(); ++i) {
-    (*topo_order)[&graph.node(ready_nodes[i])] = i;
+      ComputeTopologicalOrder(graph, extra_dependencies, &ready_nodes));
+
+  topo_order->reserve(ready_nodes.size());
+  for (int ready_node_idx : ready_nodes) {
+    topo_order->emplace_back(&graph.node(ready_node_idx));
   }
+
   return Status::OK();
 }
 
+Status ComputeTopologicalOrder(const GraphDef& graph,
+                               std::vector<const NodeDef*>* topo_order) {
+  return ComputeTopologicalOrder(graph, {}, topo_order);
+}
+
 Status ReversedTopologicalSort(GraphDef* graph) {
   std::vector<int> ready_nodes;
-  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(*graph, &ready_nodes, nullptr));
+  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(*graph, {}, &ready_nodes));
   std::reverse(ready_nodes.begin(), ready_nodes.end());
   PermuteNodesInPlace(graph, &ready_nodes, /*invert_permutation=*/true);
   return Status::OK();
@@ -96,7 +129,7 @@ Status ReversedTopologicalSort(GraphDef* graph) {
 
 Status TopologicalSort(GraphDef* graph) {
   std::vector<int> ready_nodes;
-  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(*graph, &ready_nodes, nullptr));
+  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(*graph, {}, &ready_nodes));
   PermuteNodesInPlace(graph, &ready_nodes, /*invert_permutation=*/true);
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/utils/topological_sort.h b/tensorflow/core/grappler/utils/topological_sort.h
index b8cf897a321877bc73946907aa11b8b2c20255e9..dd4208dfff3b28f2b55f71e0cf369b655d6f8c09 100644
--- a/tensorflow/core/grappler/utils/topological_sort.h
+++ b/tensorflow/core/grappler/utils/topological_sort.h
@@ -16,22 +16,40 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
 #define TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
 
+#include "absl/types/span.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace grappler {
 
-// Compute a topological ordering for the graph nodes.
+// TODO(ezhulenev, b/121379902): We should be consistent with GraphTopologyView
+// and use `GraphView::Edge` to pass extra dependencies.
+struct TopologicalDependency {
+  TopologicalDependency(const NodeDef* from, const NodeDef* to)
+      : from(from), to(to) {}
+  const NodeDef* from;
+  const NodeDef* to;
+};
+
+// Computes a topological ordering for the graph nodes and outputs nodes in the
+// topological order to the `topo_order` output argument.
+//
+// It's possible to pass additional edges that do not exists in a graph, but
+// must be respected when computing graph topological order. Example: Tensorflow
+// runtime allows concurrent execution of dequeue/enqueue ops from the same
+// queue resource, but we might want to enforce ordering between them.
 Status ComputeTopologicalOrder(
-    const GraphDef& graph, std::unordered_map<const NodeDef*, int>* topo_order,
-    const std::vector<std::pair<const NodeDef*, const NodeDef*>>*
-        extra_dependencies);
+    const GraphDef& graph,
+    absl::Span<const TopologicalDependency> extra_dependencies,
+    std::vector<const NodeDef*>* topo_order);
+Status ComputeTopologicalOrder(const GraphDef& graph,
+                               std::vector<const NodeDef*>* topo_order);
 
-// Sort a graph in topological order.
+// Sorts a graph in topological order.
 Status TopologicalSort(GraphDef* graph);
 
-// Sort a graph in topological order and reverse it.
+// Sorts a graph in topological order and reverse it.
 Status ReversedTopologicalSort(GraphDef* graph);
 
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/utils/topological_sort_test.cc b/tensorflow/core/grappler/utils/topological_sort_test.cc
index 48b7eb50bd9f2a4867e68291588d2e5c11a0c5c2..3868183c62d0dbdb09a65996b9de79b7a6001ca3 100644
--- a/tensorflow/core/grappler/utils/topological_sort_test.cc
+++ b/tensorflow/core/grappler/utils/topological_sort_test.cc
@@ -14,79 +14,94 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 
 namespace tensorflow {
 namespace grappler {
-namespace {
 
 class TopologicalSortTest : public ::testing::Test {
  protected:
-  static NodeDef CreateNode(const string& name,
-                            const std::vector<string>& inputs) {
-    return CreateNode(name, "", inputs);
-  }
-  static NodeDef CreateNode(const string& name, const string& op,
-                            const std::vector<string>& inputs) {
-    NodeDef node;
-    node.set_name(name);
-    if (!op.empty()) {
-      node.set_op(op);
+  struct NodeConfig {
+    NodeConfig(string name, std::vector<string> inputs)
+        : name(std::move(name)), inputs(std::move(inputs)) {}
+    NodeConfig(string name, string op, std::vector<string> inputs)
+        : name(std::move(name)), op(std::move(op)), inputs(std::move(inputs)) {}
+
+    string name;
+    string op;
+    std::vector<string> inputs;
+  };
+
+  static GraphDef CreateGraph(const std::vector<NodeConfig>& nodes) {
+    GraphDef graph;
+
+    for (const NodeConfig& node : nodes) {
+      NodeDef node_def;
+      node_def.set_name(node.name);
+      node_def.set_op(node.op);
+      for (const string& input : node.inputs) {
+        node_def.add_input(input);
+      }
+      *graph.add_node() = std::move(node_def);
     }
-    for (const string& input : inputs) {
-      node.add_input(input);
-    }
-    return node;
+
+    return graph;
   }
 };
 
 TEST_F(TopologicalSortTest, NoLoop) {
-  GraphDef graph;
-  *graph.add_node() = CreateNode("2", {"5"});
-  *graph.add_node() = CreateNode("0", {"5", "4"});
-  *graph.add_node() = CreateNode("1", {"4", "3"});
-  *graph.add_node() = CreateNode("3", {"2"});
-  *graph.add_node() = CreateNode("5", {});
-  *graph.add_node() = CreateNode("4", {});
-
-  std::unordered_map<const NodeDef*, int> topo_order;
-  TF_EXPECT_OK(ComputeTopologicalOrder(graph, &topo_order, nullptr));
+  GraphDef graph = CreateGraph({
+      {"2", {"5"}},       //
+      {"0", {"5", "4"}},  //
+      {"1", {"4", "3"}},  //
+      {"3", {"2"}},       //
+      {"5", {}},          //
+      {"4", {}}           //
+  });
+
+  std::vector<const NodeDef*> topo_order;
+  TF_EXPECT_OK(ComputeTopologicalOrder(graph, &topo_order));
 
   const std::vector<string> order = {"5", "4", "2", "0", "3", "1"};
-  for (const auto& topo : topo_order) {
-    const string& node_name = topo.first->name();
-    const int topo_order = topo.second;
-    std::cout << "Node " << node_name << " at order " << topo_order
-              << std::endl;
-    EXPECT_EQ(node_name, order[topo_order]);
+
+  ASSERT_EQ(topo_order.size(), order.size());
+  for (int i = 0; i < topo_order.size(); ++i) {
+    const NodeDef* node = topo_order[i];
+    EXPECT_EQ(node->name(), order[i]);
   }
 
   TF_EXPECT_OK(TopologicalSort(&graph));
-  for (int i = 0; i < order.size(); i++) {
+  for (int i = 0; i < topo_order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
   }
 }
 
 TEST_F(TopologicalSortTest, WithLoop) {
-  GraphDef graph;
-  // Create a loop
-  *graph.add_node() = CreateNode("2", "Merge", {"1", "5"});
-  *graph.add_node() = CreateNode("3", "Switch", {"2"});
-  *graph.add_node() = CreateNode("4", "Identity", {"3"});
-  *graph.add_node() = CreateNode("5", "NextIteration", {"4"});
-  *graph.add_node() = CreateNode("1", {});
-
-  std::unordered_map<const NodeDef*, int> topo_order;
-  TF_EXPECT_OK(ComputeTopologicalOrder(graph, &topo_order, nullptr));
+  GraphDef graph = CreateGraph({
+      // Graph with a loop.
+      {"2", "Merge", {"1", "5"}},     //
+      {"3", "Switch", {"2"}},         //
+      {"4", "Identity", {"3"}},       //
+      {"5", "NextIteration", {"4"}},  //
+      {"1", {}}                       //
+  });
+
+  std::vector<const NodeDef*> topo_order;
+  TF_EXPECT_OK(ComputeTopologicalOrder(graph, &topo_order));
 
   const std::vector<string> order = {"1", "2", "3", "4", "5"};
-  for (const auto& topo : topo_order) {
-    const string& node_name = topo.first->name();
-    const int topo_order = topo.second;
-    EXPECT_EQ(node_name, order[topo_order]);
+
+  ASSERT_EQ(topo_order.size(), order.size());
+  for (int i = 0; i < topo_order.size(); ++i) {
+    const NodeDef* node = topo_order[i];
+    EXPECT_EQ(node->name(), order[i]);
   }
 
   TF_EXPECT_OK(TopologicalSort(&graph));
@@ -96,12 +111,13 @@ TEST_F(TopologicalSortTest, WithLoop) {
 }
 
 TEST_F(TopologicalSortTest, WithIllegalLoop) {
-  GraphDef graph;
   // A loop without Merge and NextIteration is illegal and the original node
   // order and graph will be preserved.
-  *graph.add_node() = CreateNode("2", {"1", "3"});
-  *graph.add_node() = CreateNode("3", {"2"});
-  *graph.add_node() = CreateNode("1", {});
+  GraphDef graph = CreateGraph({
+      {"2", {"1", "3"}},  //
+      {"3", {"2"}},       //
+      {"1", {}}           //
+  });
 
   EXPECT_FALSE(TopologicalSort(&graph).ok());
   std::vector<string> order = {"2", "3", "1"};
@@ -111,9 +127,10 @@ TEST_F(TopologicalSortTest, WithIllegalLoop) {
 }
 
 TEST_F(TopologicalSortTest, DuplicatedInputs) {
-  GraphDef graph;
-  *graph.add_node() = CreateNode("2", {"1", "1"});
-  *graph.add_node() = CreateNode("1", {});
+  GraphDef graph = CreateGraph({
+      {"2", {"1", "1"}},  //
+      {"1", {}}           //
+  });
 
   TF_EXPECT_OK(TopologicalSort(&graph));
   std::vector<string> order = {"1", "2"};
@@ -123,12 +140,13 @@ TEST_F(TopologicalSortTest, DuplicatedInputs) {
 }
 
 TEST_F(TopologicalSortTest, Idempotent) {
-  GraphDef graph;
-  *graph.add_node() = CreateNode("1", {});
-  *graph.add_node() = CreateNode("2", {});
-  *graph.add_node() = CreateNode("3", {"1", "2"});
-  *graph.add_node() = CreateNode("4", {"1", "3"});
-  *graph.add_node() = CreateNode("5", {"2", "3"});
+  GraphDef graph = CreateGraph({
+      {"1", {}},          //
+      {"2", {}},          //
+      {"3", {"1", "2"}},  //
+      {"4", {"1", "3"}},  //
+      {"5", {"2", "3"}}   //
+  });
 
   TF_EXPECT_OK(TopologicalSort(&graph));
   std::vector<string> order = {"1", "2", "3", "4", "5"};
@@ -136,7 +154,7 @@ TEST_F(TopologicalSortTest, Idempotent) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
   }
 
-  // Run topo sort again to verify that it is idenpotent.
+  // Run topo sort again to verify that it is idempotent.
   TF_EXPECT_OK(TopologicalSort(&graph));
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
@@ -144,35 +162,81 @@ TEST_F(TopologicalSortTest, Idempotent) {
 }
 
 TEST_F(TopologicalSortTest, ExtraDependencies) {
-  GraphDef graph;
-  *graph.add_node() = CreateNode("2", {"5"});
-  *graph.add_node() = CreateNode("0", {"5", "4"});
-  *graph.add_node() = CreateNode("1", {"4", "3"});
-  *graph.add_node() = CreateNode("3", {"2"});
-  *graph.add_node() = CreateNode("5", {});
-  *graph.add_node() = CreateNode("4", {});
+  GraphDef graph = CreateGraph({
+      {"2", {"5"}},       //
+      {"0", {"5", "4"}},  //
+      {"1", {"4", "3"}},  //
+      {"3", {"2"}},       //
+      {"5", {}},          //
+      {"4", {}}           //
+  });
 
   // Add an edge from 4 to 5.
-  std::vector<std::pair<const NodeDef*, const NodeDef*>> extra_dependencies;
-  extra_dependencies.emplace_back(&graph.node(5), &graph.node(4));
-
-  std::unordered_map<const NodeDef*, int> topo_order;
-  TF_EXPECT_OK(
-      ComputeTopologicalOrder(graph, &topo_order, &extra_dependencies));
-
-  const std::vector<string> order = {"4", "5", "2", "0", "3", "1"};
-  for (const auto& topo : topo_order) {
-    const string& node_name = topo.first->name();
-    const int topo_order = topo.second;
-    EXPECT_EQ(node_name, order[topo_order]);
+  std::vector<TopologicalDependency> extra_dependencies;
+  extra_dependencies.push_back({&graph.node(5), &graph.node(4)});
+
+  std::vector<const NodeDef*> topo_order;
+  TF_EXPECT_OK(ComputeTopologicalOrder(graph, extra_dependencies, &topo_order));
+
+  const std::vector<string> valid_order_1 = {"4", "5", "2", "0", "3", "1"};
+  const std::vector<string> valid_order_2 = {"4", "5", "0", "2", "3", "1"};
+
+  ASSERT_EQ(topo_order.size(), valid_order_1.size());
+
+  std::vector<string> computed_order(6, "");
+  for (int i = 0; i < topo_order.size(); ++i) {
+    const NodeDef* node = topo_order[i];
+    computed_order[i] = node->name();
   }
+  EXPECT_TRUE(computed_order == valid_order_1 ||
+              computed_order == valid_order_2);
 
-  // Add an edge from 0 to 4. This will create a loop
-  extra_dependencies.emplace_back(&graph.node(1), &graph.node(5));
+  // Add an edge from `0` to `4`. This will create a loop.
+  extra_dependencies.push_back({&graph.node(1), &graph.node(5)});
   EXPECT_FALSE(
-      ComputeTopologicalOrder(graph, &topo_order, &extra_dependencies).ok());
+      ComputeTopologicalOrder(graph, extra_dependencies, &topo_order).ok());
+}
+
+static void BM_ComputeTopologicalOrder(int iters, int size) {
+  testing::StopTiming();
+
+  random::PhiloxRandom philox(0x12345);
+  random::SimplePhilox rnd(&philox);
+
+  string prefix = "long_node_name_prefix_to_measure_string_copy_overhead";
+
+  GraphDef graph;
+  for (int i = 0; i < size; ++i) {
+    const string name = absl::StrCat(prefix, i);
+    const uint32 num_inputs = rnd.Uniform(std::min(i, 5));
+
+    NodeDef node;
+    node.set_name(name);
+    for (int n = 0; n < num_inputs; ++n) {
+      const uint32 input_node = rnd.Uniform(i);
+      node.add_input(absl::StrCat(prefix, input_node));
+    }
+
+    *graph.add_node() = std::move(node);
+  }
+
+  testing::StartTiming();
+  std::vector<const NodeDef*> topo_order;
+  for (int i = 0; i < iters; i++) {
+    topo_order.clear();
+    Status st = ComputeTopologicalOrder(graph, &topo_order);
+    CHECK(st.ok()) << "Failed to compute topological order";
+  }
+  testing::StopTiming();
 }
+BENCHMARK(BM_ComputeTopologicalOrder)
+    ->Arg(10)
+    ->Arg(100)
+    ->Arg(1000)
+    ->Arg(10000)
+    ->Arg(25000)
+    ->Arg(50000)
+    ->Arg(100000);
 
-}  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/traversal.cc b/tensorflow/core/grappler/utils/traversal.cc
index 6952277568676baf5812a20c4c743356eeedd40a..c602e8c0e47723b4e6ad68431e5b08b8314d1c95 100644
--- a/tensorflow/core/grappler/utils/traversal.cc
+++ b/tensorflow/core/grappler/utils/traversal.cc
@@ -17,89 +17,109 @@ limitations under the License.
 
 #include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/graph_topology_view.h"
 
 namespace tensorflow {
 namespace grappler {
 
 namespace {
 
-template <typename GraphViewType>
-void ReverseDfsInternal(
-    const GraphViewType& graph_view, const std::vector<const NodeDef*>& from,
-    const std::function<void(const NodeDef*)>& pre_order,
-    const std::function<void(const NodeDef*)>& post_order,
-    const std::function<void(const NodeDef*, const NodeDef*)>& on_back_edge) {
-  // Stack of work to do.
-  struct StackElem {
-    const NodeDef* node;
-    bool children_visited;
-    const NodeDef* src;
-  };
-  std::vector<StackElem> stack;
+struct DfsStackElem {
+  DfsStackElem(int node, bool children_visited, int src)
+      : node(node), children_visited(children_visited), src(src) {}
+  explicit DfsStackElem(int node) : DfsStackElem(node, false, -1) {}
 
+  // Index of the node in the graph ∊ [0, num_nodes).
+  int node;
+  // `True` if visited all the input/output nodes (pushed all input/output nodes
+  // to the stack).
+  bool children_visited;
+  // Index of the node in the graph, from which we entered the `node`.
+  int src;
+};
+
+enum class NodeState { kNotVisited, kVisiting, kDone };
+
+}  // namespace
+
+void DfsTraversal(const GraphTopologyView& graph_view,
+                  const absl::Span<const NodeDef* const> from,
+                  const TraversalDirection direction,
+                  const DfsPredicates& predicates,
+                  const DfsCallbacks& callbacks) {
+  std::vector<DfsStackElem> stack;
   stack.reserve(from.size());
+
   for (const NodeDef* node : from) {
-    stack.push_back(StackElem{node, false});
+    const absl::optional<int> node_idx = graph_view.GetNodeIndex(*node);
+    DCHECK(node_idx.has_value()) << "Illegal start node: " << node->name();
+    if (node_idx.has_value()) {
+      stack.emplace_back(node_idx.value());
+    }
   }
 
-  enum NodeState { NOT_VISITED = 0, VISITING = 1, DONE = 2 };
-  absl::flat_hash_map<const NodeDef*, NodeState> node_state;
+  absl::flat_hash_map<int, NodeState> node_state;
   while (!stack.empty()) {
-    StackElem w = stack.back();
+    DfsStackElem w = stack.back();
     stack.pop_back();
 
+    NodeState& state = node_state[w.node];
+    if (state == NodeState::kDone) continue;
+
+    // Skip nodes that we should not enter.
+    if (predicates.enter && !predicates.enter(graph_view.GetNode(w.node))) {
+      state = NodeState::kDone;
+      continue;
+    }
+
+    // We've processed all the children of this node.
     if (w.children_visited) {
-      // We've processed all the children of this node
-      node_state[w.node] = DONE;
-      if (post_order) {
-        post_order(w.node);
+      state = NodeState::kDone;
+      if (callbacks.post_order) {
+        callbacks.post_order(graph_view.GetNode(w.node));
       }
       continue;
     }
 
-    auto& rslt = node_state[w.node];
-    if (rslt == DONE) {
-      continue;
-    } else if (rslt == VISITING) {
-      // Loop detected
-      if (on_back_edge) {
-        on_back_edge(w.src, w.node);
+    // Loop detected.
+    if (state == NodeState::kVisiting) {
+      if (callbacks.on_back_edge) {
+        callbacks.on_back_edge(graph_view.GetNode(w.src),
+                               graph_view.GetNode(w.node));
       }
       continue;
     }
-    rslt = VISITING;
-    if (pre_order) {
-      pre_order(w.node);
+
+    state = NodeState::kVisiting;
+    if (callbacks.pre_order) {
+      callbacks.pre_order(graph_view.GetNode(w.node));
     }
 
     // Enqueue the node again with the children_visited flag set to true.
-    stack.push_back(StackElem{w.node, true, w.src});
+    stack.emplace_back(w.node, true, w.src);
 
-    // Now enqueue the node children.
-    for (const auto fanin : graph_view.GetFanins(*w.node, true)) {
-      stack.push_back(StackElem{fanin.node, false, w.node});
+    // Check if we can continue traversal from the current node.
+    if (predicates.advance && !predicates.advance(graph_view.GetNode(w.node))) {
+      continue;
     }
-  }
-}
-
-}  // namespace
 
-void ReverseDfs(
-    const GraphView& graph_view, const std::vector<const NodeDef*>& from,
-    const std::function<void(const NodeDef*)>& pre_order,
-    const std::function<void(const NodeDef*)>& post_order,
-    const std::function<void(const NodeDef*, const NodeDef*)>& on_back_edge) {
-  ReverseDfsInternal<GraphView>(graph_view, from, pre_order, post_order,
-                                on_back_edge);
+    // Now enqueue the fanin/fanout nodes.
+    if (direction == TraversalDirection::kFollowInputs) {
+      for (const int fanin : graph_view.GetFanin(w.node)) {
+        stack.emplace_back(fanin, false, w.node);
+      }
+    } else {
+      for (const int fanout : graph_view.GetFanout(w.node)) {
+        stack.emplace_back(fanout, false, w.node);
+      }
+    }
+  }
 }
 
-void ReverseDfs(
-    const MutableGraphView& graph_view, const std::vector<const NodeDef*>& from,
-    const std::function<void(const NodeDef*)>& pre_order,
-    const std::function<void(const NodeDef*)>& post_order,
-    const std::function<void(const NodeDef*, const NodeDef*)>& on_back_edge) {
-  ReverseDfsInternal<MutableGraphView>(graph_view, from, pre_order, post_order,
-                                       on_back_edge);
+void DfsTraversal(const GraphTopologyView& graph_view,
+                  const absl::Span<const NodeDef* const> from,
+                  TraversalDirection direction, const DfsCallbacks& callbacks) {
+  DfsTraversal(graph_view, from, direction, {}, callbacks);
 }
 
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/utils/traversal.h b/tensorflow/core/grappler/utils/traversal.h
index 5b7737f97eb1f8ee56efd599d6216dc4e472febd..5c9dada4933ff803c9f53fec44f74104daec11f6 100644
--- a/tensorflow/core/grappler/utils/traversal.h
+++ b/tensorflow/core/grappler/utils/traversal.h
@@ -17,29 +17,85 @@ limitations under the License.
 #define TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_
 
 #include <functional>
-#include "tensorflow/core/grappler/graph_view.h"
-#include "tensorflow/core/grappler/mutable_graph_view.h"
+
+#include "tensorflow/core/grappler/graph_topology_view.h"
 
 namespace tensorflow {
 namespace grappler {
 
-// Traverse the graph in reverse dfs order, starting from the list of nodes
-// specified in the 'from' argument. The pre_order and post_order functors will
-// be called on each reachable node (including the 'from' nodes) in pre and post
-// order. If loops are found, the on_back_edge functor will be called on the
+enum class TraversalDirection { kFollowInputs, kFollowOutputs };
+
+// Encapsulate DFS callbacks that will be called during the graph traversal.
+//
+// If non-empty, the `pre_order` and `post_order` functors will be called on
+// each reachable node (including the `from` nodes) in pre and post order. If
+// loops are found, the `on_back_edge` functor will be called on the
 // corresponding back edges. Moreover, the pre and post order will assume that
 // these back edges will be cut.
-void ReverseDfs(
-    const GraphView& graph_view, const std::vector<const NodeDef*>& from,
-    const std::function<void(const NodeDef*)>& pre_order,
-    const std::function<void(const NodeDef*)>& post_order,
-    const std::function<void(const NodeDef*, const NodeDef*)>& on_back_edge);
-
-void ReverseDfs(
-    const MutableGraphView& graph_view, const std::vector<const NodeDef*>& from,
-    const std::function<void(const NodeDef*)>& pre_order,
-    const std::function<void(const NodeDef*)>& post_order,
-    const std::function<void(const NodeDef*, const NodeDef*)>& on_back_edge);
+struct DfsCallbacks {
+  DfsCallbacks() = default;
+  DfsCallbacks(std::function<void(const NodeDef*)> pre,
+               std::function<void(const NodeDef*)> post,
+               std::function<void(const NodeDef*, const NodeDef*)> back_edge)
+      : pre_order(std::move(pre)),
+        post_order(std::move(post)),
+        on_back_edge(std::move(back_edge)) {}
+
+  static DfsCallbacks PreOrder(std::function<void(const NodeDef*)> pre) {
+    return DfsCallbacks(std::move(pre), nullptr, nullptr);
+  }
+
+  static DfsCallbacks PostOrder(std::function<void(const NodeDef*)> post) {
+    return DfsCallbacks(nullptr, std::move(post), nullptr);
+  }
+
+  std::function<void(const NodeDef*)> pre_order;
+  std::function<void(const NodeDef*)> post_order;
+  std::function<void(const NodeDef*, const NodeDef*)> on_back_edge;
+};
+
+// Encapsulate DFS predicates for traversing the graph.
+//
+// The `enter` predicate decides if traversal should enter the node, and the
+// `advance` predicate decides if the traversal should follow inputs/outputs
+// from the node.
+//
+// If predicates are empty (default initialized), it's assumed that we can enter
+// into any node and advance from any node respectively.
+struct DfsPredicates {
+  DfsPredicates() = default;
+  DfsPredicates(std::function<bool(const NodeDef*)> enter,
+                std::function<bool(const NodeDef*)> advance)
+      : enter(std::move(enter)), advance(std::move(advance)) {}
+
+  static DfsPredicates Enter(std::function<bool(const NodeDef*)> enter) {
+    return DfsPredicates(std::move(enter), nullptr);
+  }
+
+  static DfsPredicates Advance(std::function<bool(const NodeDef*)> advance) {
+    return DfsPredicates(nullptr, std::move(advance));
+  }
+
+  std::function<bool(const NodeDef*)> enter;
+  std::function<bool(const NodeDef*)> advance;
+};
+
+// Traverse the graph in DFS order in the given direction, starting from the
+// list of nodes specified in the `from` argument. Use `predicates` to decide if
+// traversal should enter/advance to/from the graph node. These predicates also
+// applied to the `from` nodes. Call corresponding callbacks for each visited
+// node.
+void DfsTraversal(const GraphTopologyView& graph_view,
+                  absl::Span<const NodeDef* const> from,
+                  TraversalDirection direction, const DfsPredicates& predicates,
+                  const DfsCallbacks& callbacks);
+
+// Traverse the graph in DFS order in the given direction, starting from the
+// list of nodes specified in the `from` argument. Call corresponding callbacks
+// for each visited node.
+void DfsTraversal(const GraphTopologyView& graph_view,
+                  absl::Span<const NodeDef* const> from,
+                  TraversalDirection direction, const DfsCallbacks& callbacks);
 
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/traversal_test.cc b/tensorflow/core/grappler/utils/traversal_test.cc
index c040477a08970436cb07f6bb87c30e47b6b72525..7b36d328e938473333bd79044b7e953a2f25e17c 100644
--- a/tensorflow/core/grappler/utils/traversal_test.cc
+++ b/tensorflow/core/grappler/utils/traversal_test.cc
@@ -15,101 +15,222 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/utils/traversal.h"
 
-#include "tensorflow/core/lib/strings/strcat.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 namespace grappler {
+
 namespace {
+using ::tensorflow::test::function::NDef;
+
+DfsCallbacks MkCallbacks(std::vector<string>* pre_order,
+                         std::vector<string>* post_order,
+                         std::vector<string>* back_edges) {
+  return {[pre_order](const NodeDef* n) { pre_order->push_back(n->name()); },
+          [post_order](const NodeDef* n) { post_order->push_back(n->name()); },
+          [back_edges](const NodeDef* src, const NodeDef* dst) {
+            back_edges->push_back(absl::StrCat(src->name(), "->", dst->name()));
+          }};
+}
+
+TEST(TraversalTest, OutputsDfsNoLoop) {
+  const string op = "OpIsNotImportantInThisTest";
+
+  GraphDef graph = ::tensorflow::test::function::GDef(  //
+      {NDef("2", op, {"5"}, {}),                        //
+       NDef("0", op, {"5", "4"}, {}),                   //
+       NDef("1", op, {"4", "3"}, {}),                   //
+       NDef("3", op, {"2"}, {}),                        //
+       NDef("5", op, {}, {}),                           //
+       NDef("4", op, {}, {})},                          //
+      /*funcs=*/{});
 
-class TraversalTest : public ::testing::Test {
- protected:
-  static NodeDef CreateNode(const string& name,
-                            const std::vector<string>& inputs) {
-    return CreateNode(name, "", inputs);
-  }
-  static NodeDef CreateNode(const string& name, const string& op,
-                            const std::vector<string>& inputs) {
-    NodeDef node;
-    node.set_name(name);
-    if (!op.empty()) {
-      node.set_op(op);
-    }
-    for (const string& input : inputs) {
-      node.add_input(input);
-    }
-    return node;
-  }
-};
-
-TEST_F(TraversalTest, ReverseDfsNoLoop) {
-  GraphDef graph;
-  *graph.add_node() = CreateNode("2", {"5"});
-  *graph.add_node() = CreateNode("0", {"5", "4"});
-  *graph.add_node() = CreateNode("1", {"4", "3"});
-  *graph.add_node() = CreateNode("3", {"2"});
-  *graph.add_node() = CreateNode("5", {});
-  *graph.add_node() = CreateNode("4", {});
+  std::vector<const NodeDef*> start_nodes = {&graph.node(4), &graph.node(5)};
+
+  std::vector<string> pre_order;
+  std::vector<string> post_order;
+  std::vector<string> back_edges;
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  DfsTraversal(graph_view, start_nodes, TraversalDirection::kFollowOutputs,
+               MkCallbacks(&pre_order, &post_order, &back_edges));
+
+  const std::vector<string> expected_pre = {"4", "1", "0", "5", "2", "3"};
+  const std::vector<string> expected_post = {"1", "0", "4", "3", "2", "5"};
+
+  EXPECT_EQ(pre_order, expected_pre);
+  EXPECT_EQ(post_order, expected_post);
+  EXPECT_TRUE(back_edges.empty());
+}
+
+TEST(TraversalTest, InputsDfsNoLoop) {
+  const string op = "OpIsNotImportantInThisTest";
+
+  GraphDef graph = ::tensorflow::test::function::GDef(  //
+      {NDef("2", op, {"5"}, {}),                        //
+       NDef("0", op, {"5", "4"}, {}),                   //
+       NDef("1", op, {"4", "3"}, {}),                   //
+       NDef("3", op, {"2"}, {}),                        //
+       NDef("5", op, {}, {}),                           //
+       NDef("4", op, {}, {})},                          //
+      /*funcs=*/{});
 
   std::vector<const NodeDef*> start_nodes = {&graph.node(1), &graph.node(2)};
+
   std::vector<string> pre_order;
   std::vector<string> post_order;
-  bool found_back_edge = false;
-  ReverseDfs(
-      GraphView(&graph), start_nodes,
-      [&pre_order](const NodeDef* n) { pre_order.push_back(n->name()); },
-      [&post_order](const NodeDef* n) { post_order.push_back(n->name()); },
-      [&found_back_edge](const NodeDef*, const NodeDef*) {
-        found_back_edge = true;
-      });
-
-  // Pre/Post order traversals are non deterministic because a node fanin is an
-  // absl::flat_hash_set with non deterministic traversal order.
-  using ValidTraversal = std::pair<std::vector<string>, std::vector<string>>;
-
-  std::set<ValidTraversal> valid_traversals = {
-      // pre_order                     post_order
-      {{"1", "4", "3", "2", "5", "0"}, {"4", "5", "2", "3", "1", "0"}},
-      {{"1", "3", "2", "5", "4", "0"}, {"5", "2", "3", "4", "1", "0"}}};
-
-  EXPECT_EQ(valid_traversals.count({pre_order, post_order}), 1);
-  EXPECT_FALSE(found_back_edge);
+  std::vector<string> back_edges;
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  DfsTraversal(graph_view, start_nodes, TraversalDirection::kFollowInputs,
+               MkCallbacks(&pre_order, &post_order, &back_edges));
+
+  const std::vector<string> expected_pre = {"1", "4", "3", "2", "5", "0"};
+  const std::vector<string> expected_post = {"4", "5", "2", "3", "1", "0"};
+
+  EXPECT_EQ(pre_order, expected_pre);
+  EXPECT_EQ(post_order, expected_post);
+  EXPECT_TRUE(back_edges.empty());
 }
 
-TEST_F(TraversalTest, ReverseDfsWithLoop) {
-  GraphDef graph;
-  // Create a loop
-  *graph.add_node() = CreateNode("2", "Merge", {"1", "5"});
-  *graph.add_node() = CreateNode("3", "Switch", {"2"});
-  *graph.add_node() = CreateNode("4", "Identity", {"3"});
-  *graph.add_node() = CreateNode("5", "NextIteration", {"4"});
-  *graph.add_node() = CreateNode("1", "Enter", {});
-  *graph.add_node() = CreateNode("6", "Exit", {"3"});
+TEST(TraversalTest, InputsDfsWithLoop) {
+  // Graph with a loop.
+  GraphDef graph = ::tensorflow::test::function::GDef(  //
+      {NDef("2", "Merge", {"1", "5"}, {}),              //
+       NDef("3", "Switch", {"2"}, {}),                  //
+       NDef("4", "Identity", {"3"}, {}),                //
+       NDef("5", "NextIteration", {"4"}, {}),           //
+       NDef("1", "Enter", {}, {}),                      //
+       NDef("6", "Exit", {"3"}, {})},                   //
+      /*funcs=*/{});
 
   std::vector<const NodeDef*> start_nodes = {&graph.node(5)};
+
   std::vector<string> pre_order;
   std::vector<string> post_order;
   std::vector<string> back_edges;
-  ReverseDfs(
-      GraphView(&graph), start_nodes,
-      [&pre_order](const NodeDef* n) { pre_order.push_back(n->name()); },
-      [&post_order](const NodeDef* n) { post_order.push_back(n->name()); },
-      [&back_edges](const NodeDef* src, const NodeDef* dst) {
-        back_edges.push_back(strings::StrCat(src->name(), "->", dst->name()));
-      });
-
-  // Pre/Post order traversals are non deterministic because a node fanin is an
-  // absl::flat_hash_set with non deterministic traversal order.
-  using ValidTraversal = std::pair<std::vector<string>, std::vector<string>>;
-
-  std::set<ValidTraversal> valid_traversals = {
-      // pre_order                     post_order
-      {{"6", "3", "2", "4", "5", "1"}, {"5", "4", "1", "2", "3", "6"}},
-      {{"6", "3", "2", "1", "5", "4"}, {"1", "4", "5", "2", "3", "6"}},
-      {{"6", "3", "2", "5", "4", "1"}, {"4", "5", "1", "2", "3", "6"}}};
-
-  EXPECT_EQ(valid_traversals.count({pre_order, post_order}), 1);
-  EXPECT_EQ(std::vector<string>({"4->3"}), back_edges);
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  DfsTraversal(graph_view, start_nodes, TraversalDirection::kFollowInputs,
+               MkCallbacks(&pre_order, &post_order, &back_edges));
+
+  const std::vector<string> expected_pre = {"6", "3", "2", "1", "5", "4"};
+  const std::vector<string> expected_post = {"1", "4", "5", "2", "3", "6"};
+  const std::vector<string> expected_edges = {"4->3"};
+
+  EXPECT_EQ(pre_order, expected_pre);
+  EXPECT_EQ(post_order, expected_post);
+  EXPECT_EQ(back_edges, expected_edges);
+}
+
+TEST(TraversalTest, OutputDfsWithLoop) {
+  // Graph with a loop.
+  GraphDef graph = ::tensorflow::test::function::GDef(  //
+      {NDef("2", "Merge", {"1", "5"}, {}),              //
+       NDef("3", "Switch", {"2"}, {}),                  //
+       NDef("4", "Identity", {"3"}, {}),                //
+       NDef("5", "NextIteration", {"4"}, {}),           //
+       NDef("1", "Enter", {}, {}),                      //
+       NDef("6", "Exit", {"3"}, {})},                   //
+      /*funcs=*/{});
+
+  std::vector<const NodeDef*> start_nodes = {&graph.node(0)};
+
+  std::vector<string> pre_order;
+  std::vector<string> post_order;
+  std::vector<string> back_edges;
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  DfsTraversal(graph_view, start_nodes, TraversalDirection::kFollowOutputs,
+               MkCallbacks(&pre_order, &post_order, &back_edges));
+
+  const std::vector<string> expected_pre = {"2", "3", "6", "4", "5"};
+  const std::vector<string> expected_post = {"6", "5", "4", "3", "2"};
+  const std::vector<string> expected_edges = {"5->2"};
+
+  EXPECT_EQ(pre_order, expected_pre);
+  EXPECT_EQ(post_order, expected_post);
+  EXPECT_EQ(back_edges, expected_edges);
+}
+
+TEST(TraversalTest, DfsWithEnterPredicate) {
+  const string op = "OpIsNotImportantInThisTest";
+
+  GraphDef graph = ::tensorflow::test::function::GDef(  //
+      {NDef("1", op, {}, {}),                           //       2 -> 3
+       NDef("2", op, {"1"}, {}),                        // 1 -> /      \ -> 6
+       NDef("3", op, {"2"}, {}),                        //      \      /
+       NDef("4", op, {"1"}, {}),                        //       4 -> 5
+       NDef("5", op, {"4"}, {}),                        //
+       NDef("6", op, {"3", "5"}, {})},                  //
+      /*funcs=*/{});
+
+  // Do not enter the nodes '2' and '3'.
+  const auto enter = [](const NodeDef* node) {
+    return node->name() != "2" && node->name() != "3";
+  };
+
+  std::vector<const NodeDef*> start_nodes = {&graph.node(0)};
+
+  std::vector<string> pre_order;
+  std::vector<string> post_order;
+  std::vector<string> back_edges;
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  DfsTraversal(graph_view, start_nodes, TraversalDirection::kFollowOutputs,
+               DfsPredicates::Enter(enter),
+               MkCallbacks(&pre_order, &post_order, &back_edges));
+
+  const std::vector<string> expected_pre = {"1", "4", "5", "6"};
+  const std::vector<string> expected_post = {"6", "5", "4", "1"};
+
+  EXPECT_EQ(pre_order, expected_pre);
+  EXPECT_EQ(post_order, expected_post);
+  EXPECT_TRUE(back_edges.empty());
+}
+
+TEST(TraversalTest, DfsWithAdvancePredicate) {
+  const string op = "OpIsNotImportantInThisTest";
+
+  GraphDef graph = ::tensorflow::test::function::GDef(  //
+      {NDef("1", op, {}, {}),                           //       2 -> 3
+       NDef("2", op, {"1"}, {}),                        // 1 -> /      \ -> 6
+       NDef("3", op, {"2"}, {}),                        //      \      /
+       NDef("4", op, {"1"}, {}),                        //       4 -> 5
+       NDef("5", op, {"4"}, {}),                        //
+       NDef("6", op, {"3", "5"}, {})},                  //
+      {} /* empty function library*/);
+
+  // Do not advance from the nodes '2' and '3'.
+  const auto advance = [](const NodeDef* node) {
+    return node->name() != "2" && node->name() != "3";
+  };
+
+  std::vector<const NodeDef*> start_nodes = {&graph.node(0)};
+
+  std::vector<string> pre_order;
+  std::vector<string> post_order;
+  std::vector<string> back_edges;
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  DfsTraversal(graph_view, start_nodes, TraversalDirection::kFollowOutputs,
+               DfsPredicates::Advance(advance),
+               MkCallbacks(&pre_order, &post_order, &back_edges));
+
+  const std::vector<string> expected_pre = {"1", "4", "5", "6", "2"};
+  const std::vector<string> expected_post = {"6", "5", "4", "2", "1"};
+
+  EXPECT_EQ(pre_order, expected_pre);
+  EXPECT_EQ(post_order, expected_post);
+  EXPECT_TRUE(back_edges.empty());
 }
 
 }  // namespace
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 6fa139e5dfd633583b14eecf023fc0b18437917f..1a20e8628ec10131046ada57579b53b487c396d3 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -1012,7 +1012,16 @@ tf_kernel_library(
     hdrs = ["tile_functor.h"],
     gpu_srcs = [
         "tile_functor.h",
-        "tile_functor_gpu.cu.cc",
+        "tile_functor_gpu.h",
+        "tile_functor_gpu_bool.cu.cc",
+        "tile_functor_gpu_complex64.cu.cc",
+        "tile_functor_gpu_complex128.cu.cc",
+        "tile_functor_gpu_double.cu.cc",
+        "tile_functor_gpu_float.cu.cc",
+        "tile_functor_gpu_half.cu.cc",
+        "tile_functor_gpu_int16.cu.cc",
+        "tile_functor_gpu_int32.cu.cc",
+        "tile_functor_gpu_int64.cu.cc",
     ],
     prefix = "tile_ops",
     deps = ARRAY_DEPS,
@@ -2247,6 +2256,7 @@ tf_kernel_library(
     ],
     deps = [
         ":concat_lib",
+        ":fill_functor",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:list_ops_op_lib",
@@ -2303,6 +2313,7 @@ tf_kernel_library(
         "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/optimizers:meta_optimizer",
         "//tensorflow/core/grappler/utils:functions",
+        "//tensorflow/stream_executor:stream",
     ],
 )
 
@@ -2478,7 +2489,6 @@ tf_kernel_library(
     prefix = "encode_wav_op",
     deps = [
         ":bounds_check",
-        "//tensorflow/core:audio_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -2490,7 +2500,6 @@ tf_kernel_library(
     name = "decode_wav_op",
     prefix = "decode_wav_op",
     deps = [
-        "//tensorflow/core:audio_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -2513,6 +2522,7 @@ tf_cc_tests(
         ":eigen_helpers",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -3110,7 +3120,7 @@ tf_kernel_library(
         "//conditions:default": [],
     }),
     prefix = "sparse_matmul_op",
-    deps = MATH_DEPS + select({
+    deps = MATH_DEPS + [":eigen_contraction_kernel"] + select({
         ":xsmm": [
             "@libxsmm_archive//:xsmm_avx",
         ],
@@ -3271,7 +3281,15 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "scan_ops",
-    prefix = "scan_ops",
+    srcs = ["scan_ops.cc"],
+    hdrs = ["scan_ops.h"],
+    gpu_srcs = [
+        "scan_ops.h",
+        "scan_ops_gpu.h",
+        "scan_ops_gpu_double.cu.cc",
+        "scan_ops_gpu_float.cu.cc",
+        "scan_ops_gpu_half.cu.cc",
+    ],
     deps = MATH_DEPS + if_cuda(["@cub_archive//:cub"]),
 )
 
@@ -3696,7 +3714,15 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "depthwise_conv_op",
-    prefix = "depthwise_conv_op",
+    srcs = ["depthwise_conv_op.cc"],
+    hdrs = ["depthwise_conv_op.h"],
+    gpu_srcs = [
+        "depthwise_conv_op.h",
+        "depthwise_conv_op_gpu.h",
+        "depthwise_conv_op_gpu_double.cu.cc",
+        "depthwise_conv_op_gpu_float.cu.cc",
+        "depthwise_conv_op_gpu_half.cu.cc",
+    ],
     deps = [
         ":bounds_check",
         ":conv_ops",
@@ -3794,6 +3820,8 @@ tf_kernel_library(
     deps = NN_DEPS + if_cuda([
         ":reduction_ops",
         "@cub_archive//:cub",
+        "//tensorflow/core:stream_executor",
+        "//tensorflow/stream_executor/cuda:cuda_stream",
     ]),
 )
 
@@ -3846,7 +3874,21 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "topk_op",
-    prefix = "topk_op",
+    srcs = ["topk_op.cc"],
+    hdrs = ["topk_op.h"],
+    gpu_srcs = [
+        "topk_op.h",
+        "topk_op_gpu.h",
+        "topk_op_gpu_double.cu.cc",
+        "topk_op_gpu_float.cu.cc",
+        "topk_op_gpu_half.cu.cc",
+        "topk_op_gpu_int64.cu.cc",
+        "topk_op_gpu_int32.cu.cc",
+        "topk_op_gpu_int16.cu.cc",
+        "topk_op_gpu_uint16.cu.cc",
+        "topk_op_gpu_int8.cu.cc",
+        "topk_op_gpu_uint8.cu.cc",
+    ],
     deps = NN_DEPS + if_cuda(["@cub_archive//:cub"]),
 )
 
@@ -4445,7 +4487,10 @@ tf_kernel_library(
     deps = SPARSE_DEPS + [
         ":bounds_check",
         "//third_party/eigen3",
-    ],
+    ] + if_cuda([
+        ":reduction_ops",
+        "@cub_archive//:cub",
+    ]),
 )
 
 tf_kernel_library(
@@ -4570,6 +4615,7 @@ cc_library(
     srcs = ["sdca_internal.cc"],
     hdrs = ["sdca_internal.h"],
     deps = [
+        ":eigen_contraction_kernel",
         ":loss_updaters",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -5097,7 +5143,6 @@ tf_kernel_library(
     prefix = "spectrogram_op",
     deps = [
         ":spectrogram",
-        "//tensorflow/core:audio_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -5215,7 +5260,6 @@ tf_kernel_library(
     prefix = "mfcc_op",
     deps = [
         ":mfcc",
-        "//tensorflow/core:audio_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -6877,14 +6921,14 @@ tf_kernel_library(
     name = "summary_kernels",
     srcs = ["summary_kernels.cc"],
     deps = [
-        "//tensorflow/contrib/tensorboard/db:schema",
-        "//tensorflow/contrib/tensorboard/db:summary_db_writer",
-        "//tensorflow/contrib/tensorboard/db:summary_file_writer",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:summary_ops_op_lib",
         "//tensorflow/core/lib/db:sqlite",
+        "//tensorflow/core/summary:schema",
+        "//tensorflow/core/summary:summary_db_writer",
+        "//tensorflow/core/summary:summary_file_writer",
     ],
 )
 
diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc
index aa9123582210bdf31993e9d8c58ba90cc02acc5e..d5bd36b4ceaa62f6c2f6928bbea704a0e6d01017 100644
--- a/tensorflow/core/kernels/barrier_ops.cc
+++ b/tensorflow/core/kernels/barrier_ops.cc
@@ -300,7 +300,7 @@ class Barrier : public ResourceBase {
     ready_queue_->Unref();
   }
 
-  string DebugString() override { return "A barrier"; }
+  string DebugString() const override { return "A barrier"; }
 
  protected:
   template <typename T>
diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc
index 35ddda0ec04da6f3b6f11606ecb019e38698c6d7..5ba461aa9de2a647962c653fb9ca0f199e9110be 100644
--- a/tensorflow/core/kernels/batch_kernels.cc
+++ b/tensorflow/core/kernels/batch_kernels.cc
@@ -233,7 +233,7 @@ class BatchResource : public ResourceBase {
     return Status::OK();
   }
 
-  string DebugString() final { return "BatchResource"; }
+  string DebugString() const final { return "BatchResource"; }
 
   // Ingests data from one invocation of the batch op. The data is enqueued to
   // be combined with others into a batch, asynchronously.
@@ -878,7 +878,7 @@ class UnbatchResource : public ResourceBase {
     timeout_enforcer_ = nullptr;
   }
 
-  string DebugString() final { return "UnbatchResource"; }
+  string DebugString() const final { return "UnbatchResource"; }
 
   Status Compute(OpKernelContext* context, AsyncOpKernel::DoneCallback done) {
     const Tensor& data_t = context->input(0);
@@ -1094,7 +1094,7 @@ class UnbatchGradResource : public ResourceBase {
  public:
   UnbatchGradResource() {}
 
-  string DebugString() final { return "UnbatchGradResource"; }
+  string DebugString() const final { return "UnbatchGradResource"; }
 
   // Flushes the information for one batch, given its context and done
   // callback. Clears all information about it from the available_tensors_.
diff --git a/tensorflow/core/kernels/bias_op.cc b/tensorflow/core/kernels/bias_op.cc
index d4f4b43d63b90c22abbbe82263b09353912010c8..9006fb46fd5bc5494935ce5f32cfb8363a08650c 100644
--- a/tensorflow/core/kernels/bias_op.cc
+++ b/tensorflow/core/kernels/bias_op.cc
@@ -153,13 +153,13 @@ class BiasOp : public BinaryOp<T> {
               bias.tensor<T, 1>().reshape(four_dims).broadcast(broad_cast_dims);
         } break;
         case 5: {
-          Eigen::DSizes<int32, 5> four_dims(1, channel, 1, 1, 1);
+          Eigen::DSizes<int32, 5> five_dims(1, channel, 1, 1, 1);
           Eigen::DSizes<int32, 5> broad_cast_dims(batch, 1, height, width,
                                                   depth);
           const Device& d = context->eigen_device<Device>();
           output->tensor<T, 5>().device(d) =
               input.tensor<T, 5>() +
-              bias.tensor<T, 1>().reshape(four_dims).broadcast(broad_cast_dims);
+              bias.tensor<T, 1>().reshape(five_dims).broadcast(broad_cast_dims);
         } break;
         default:
           OP_REQUIRES(context, false,
@@ -269,28 +269,24 @@ class BiasGradOp : public OpKernel {
       output->template flat<T>().setZero();
     } else {
       // Added by intel_tf to support NCHW on CPU regardless of MKL used or not.
-      // TODO(yongtang): Add 3/4/5 dimensional data support for NCHW format.
       if (data_format_ == FORMAT_NCHW) {
-        OP_REQUIRES(context, output_backprop.dims() == 4,
-                    errors::InvalidArgument(
-                        "NCHW format supports only 4D input/output tensor."));
-        Eigen::DSizes<Eigen::Index, 4> four_dims(batch, channel, height, width);
+        Eigen::DSizes<Eigen::Index, 3> three_dims(batch, channel,
+                                                  height * width * depth);
 #ifdef EIGEN_HAS_INDEX_LIST
         using idx0 = Eigen::type2index<0>;
         using idx2 = Eigen::type2index<2>;
-        using idx3 = Eigen::type2index<3>;
-        Eigen::IndexList<idx0, idx2, idx3> reduction_axes;
+        Eigen::IndexList<idx0, idx2> reduction_axes;
 #else
-        Eigen::array<Eigen::Index, 3> reduction_axes = {0, 2, 3};
+        Eigen::array<Eigen::Index, 2> reduction_axes = {0, 2};
 #endif
         output->template flat<T>().device(context->eigen_device<Device>()) =
             output_backprop.flat<T>()
                 .template cast<typename AccumulatorType<T>::type>()
-                .reshape(four_dims)
+                .reshape(three_dims)
                 .sum(reduction_axes)
                 .template cast<T>();  // End of code by intel_tf.
       } else {
-        Eigen::DSizes<Eigen::Index, 2> two_dims(batch * height * width,
+        Eigen::DSizes<Eigen::Index, 2> two_dims(batch * height * width * depth,
                                                 channel);
 #ifdef EIGEN_HAS_INDEX_LIST
         Eigen::IndexList<Eigen::type2index<0> > reduction_axis;
@@ -496,21 +492,21 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
 
   void ComputeWithCustomKernel(OpKernelContext* context,
                                const Tensor& output_backprop, int32 batch,
-                               int32 width, int32 height, int32 channel,
-                               Tensor* output) {
+                               int32 width, int32 height, int32 depth,
+                               int32 channel, Tensor* output) {
     BiasGradGPU<T>::compute(context->template eigen_device<Device>(),
                             output_backprop.template flat<T>().data(),
                             output->flat<T>().data(), batch, width, height,
-                            channel, data_format_);
+                            depth, channel, data_format_);
   }
 
   void ComputeWithReduceSum(OpKernelContext* context,
                             const Tensor& output_backprop, int32 batch,
-                            int32 width, int32 height, int32 channel,
-                            Tensor* output) {
+                            int32 width, int32 height, int32 depth,
+                            int32 channel, Tensor* output) {
     if (data_format_ == FORMAT_NCHW) {
       int32 row_count = batch * channel;
-      int32 col_count = height * width;
+      int32 col_count = height * width * depth;
       Tensor temp_grad_outputs;
       // For 'NCHW' format, we perform reduction twice: first HW, then N.
       TensorShape temp_grad_output_shape{row_count, col_count};
@@ -528,7 +524,7 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
                                      row_count, col_count);
     } else {
       // For 'NHWC', we simply apply reduction once on NHW.
-      int32 row_count = batch * height * width;
+      int32 row_count = batch * height * width * depth;
       int32 col_count = channel;
       BiasGradGPU<T>::DoColReduction(
           context, const_cast<T*>(output->flat<T>().data()),
@@ -561,7 +557,7 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
     int device_id = stream->parent()->device_ordinal();
     DataType dtype = output_backprop.dtype();
     BiasAddParams bias_parameters = {
-        {batch, height * width, channel},
+        {batch, height * width * depth, channel},
         data_format_,
         dtype,
         device_id,
@@ -576,7 +572,7 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
       stream->InitTimer(&timer);
       stream->ThenStartTimer(&timer);
       ComputeWithCustomKernel(context, output_backprop, batch, width, height,
-                              channel, output);
+                              depth, channel, output);
       stream->ThenStopTimer(&timer);
       uint64 elapsed_microseconds = timer.Microseconds();
       VLOG(1) << "BiasAddGrad " << bias_parameters.ToString()
@@ -589,7 +585,7 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
       // Try reduction and profile.
       stream->ThenStartTimer(&timer);
       ComputeWithReduceSum(context, output_backprop, batch, width, height,
-                           channel, output);
+                           depth, channel, output);
       stream->ThenStopTimer(&timer);
 
       elapsed_microseconds = timer.Microseconds();
@@ -610,11 +606,11 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
     // Choose the best algorithm based on autotune results.
     if (algo_config.get_mode() == BiasAddGradGPUMode::kReduction) {
       ComputeWithReduceSum(context, output_backprop, batch, width, height,
-                           channel, output);
+                           depth, channel, output);
     } else {
       // Default to the customized kernel.
       ComputeWithCustomKernel(context, output_backprop, batch, width, height,
-                              channel, output);
+                              depth, channel, output);
     }
   }
 
diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc
index 24fea8a8e6f10cea4f74e743c8aa2c6bfb49313f..006fa1dc712f7c06953f70e278fedaa3504bfcce 100644
--- a/tensorflow/core/kernels/bias_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc
@@ -195,10 +195,10 @@ __global__ void BiasGradNCHW_SharedAtomics(const T* output_backprop,
 template <typename T>
 void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
                              T* bias_backprop, int32 batch, int32 height,
-                             int32 width, int32 channel,
+                             int32 width, int32 depth, int32 channel,
                              TensorFormat data_format) {
   const int32 bias_size = channel;
-  const int32 image_size = height * width;
+  const int32 image_size = height * width * depth;
   const int32 total_count = batch * bias_size * image_size;
   if (total_count == 0) {
     return;
diff --git a/tensorflow/core/kernels/bias_op_gpu.h b/tensorflow/core/kernels/bias_op_gpu.h
index a0b2ce4f9b34b0b343de3d09374b07d554c57d15..372a403e6872dcfb0c41b0dafe5be045c3388054 100644
--- a/tensorflow/core/kernels/bias_op_gpu.h
+++ b/tensorflow/core/kernels/bias_op_gpu.h
@@ -39,7 +39,7 @@ template <typename T>
 struct BiasGradGPU {
   static void compute(const GPUDevice& device, const T* output_backprop,
                       T* bias_backprop, int32 batch, int32 height, int32 width,
-                      int32 channel, TensorFormat data_format);
+                      int32 depth, int32 channel, TensorFormat data_format);
 
   static void DoRowReduction(OpKernelContext* context, T* output,
                              const T* input, int rows, int cols);
diff --git a/tensorflow/core/kernels/boosted_trees/BUILD b/tensorflow/core/kernels/boosted_trees/BUILD
index 8f2c2dbe8a778353dff5e0b8823ac99de68282df..285cded181cb2014e50f96c957290d642fcb6810 100644
--- a/tensorflow/core/kernels/boosted_trees/BUILD
+++ b/tensorflow/core/kernels/boosted_trees/BUILD
@@ -31,7 +31,6 @@ tf_kernel_library(
     deps = [
         ":resource_ops",
         ":resources",
-        "//tensorflow/core:boosted_trees_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -60,7 +59,6 @@ tf_kernel_library(
     srcs = ["resource_ops.cc"],
     deps = [
         ":resources",
-        "//tensorflow/core:boosted_trees_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
@@ -72,7 +70,6 @@ tf_kernel_library(
     srcs = ["stats_ops.cc"],
     deps = [
         ":tree_helper",
-        "//tensorflow/core:boosted_trees_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
     ],
@@ -84,7 +81,6 @@ tf_kernel_library(
     deps = [
         ":resources",
         ":tree_helper",
-        "//tensorflow/core:boosted_trees_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
@@ -95,7 +91,6 @@ tf_kernel_library(
     name = "quantile_ops",
     srcs = ["quantile_ops.cc"],
     deps = [
-        "//tensorflow/core:boosted_trees_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/kernels/boosted_trees/quantiles:weighted_quantiles",
diff --git a/tensorflow/core/kernels/boosted_trees/quantiles/quantile_stream_resource.h b/tensorflow/core/kernels/boosted_trees/quantiles/quantile_stream_resource.h
index 1c31724272ab11a20ac6f72edd87a86105dd643e..965bf2c924c8791578c5f069e40d2d748e5f3978 100644
--- a/tensorflow/core/kernels/boosted_trees/quantiles/quantile_stream_resource.h
+++ b/tensorflow/core/kernels/boosted_trees/quantiles/quantile_stream_resource.h
@@ -37,15 +37,15 @@ class BoostedTreesQuantileStreamResource : public ResourceBase {
         epsilon_(epsilon),
         num_streams_(num_streams),
         max_elements_(max_elements) {
-          streams_.reserve(num_streams_);
-          boundaries_.reserve(num_streams_);
-          for (int64 idx = 0; idx < num_streams; ++idx) {
-            streams_.push_back(QuantileStream(epsilon, max_elements));
-            boundaries_.push_back(std::vector<float>());
-          }
-        }
-
-  string DebugString() override { return "QuantileStreamResource"; }
+    streams_.reserve(num_streams_);
+    boundaries_.reserve(num_streams_);
+    for (int64 idx = 0; idx < num_streams; ++idx) {
+      streams_.push_back(QuantileStream(epsilon, max_elements));
+      boundaries_.push_back(std::vector<float>());
+    }
+  }
+
+  string DebugString() const override { return "QuantileStreamResource"; }
 
   tensorflow::mutex* mutex() { return &mu_; }
 
diff --git a/tensorflow/core/kernels/boosted_trees/resources.cc b/tensorflow/core/kernels/boosted_trees/resources.cc
index 2798722536271380697539dca4d83ca865051da6..42df4848815db7a097a70b4f1713fd42484be438 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.cc
+++ b/tensorflow/core/kernels/boosted_trees/resources.cc
@@ -31,7 +31,7 @@ BoostedTreesEnsembleResource::BoostedTreesEnsembleResource()
           protobuf::Arena::CreateMessage<boosted_trees::TreeEnsemble>(
               &arena_)) {}
 
-string BoostedTreesEnsembleResource::DebugString() {
+string BoostedTreesEnsembleResource::DebugString() const {
   return strings::StrCat("TreeEnsemble[size=", tree_ensemble_->trees_size(),
                          "]");
 }
diff --git a/tensorflow/core/kernels/boosted_trees/resources.h b/tensorflow/core/kernels/boosted_trees/resources.h
index f961ed38142709b01ba009a4d8fb3dab2fe757c4..3c7b2df9b08a2b8912c43b2439e28f34a64b38ef 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.h
+++ b/tensorflow/core/kernels/boosted_trees/resources.h
@@ -48,7 +48,7 @@ class BoostedTreesEnsembleResource : public StampedResource {
  public:
   BoostedTreesEnsembleResource();
 
-  string DebugString() override;
+  string DebugString() const override;
 
   bool InitFromSerialized(const string& serialized, const int64 stamp_token);
 
diff --git a/tensorflow/core/kernels/conditional_accumulator_base.h b/tensorflow/core/kernels/conditional_accumulator_base.h
index 4a5ec6f0fb3c7272dd0684da3ce56e787848dd7d..2618ffbb099cd1619de826f6b0e4e5ae20982197 100644
--- a/tensorflow/core/kernels/conditional_accumulator_base.h
+++ b/tensorflow/core/kernels/conditional_accumulator_base.h
@@ -68,7 +68,7 @@ class ConditionalAccumulatorBase : public ResourceBase {
 
   const DataType& dtype() const { return dtype_; }
 
-  string DebugString() override { return "A conditional accumulator"; }
+  string DebugString() const override { return "A conditional accumulator"; }
 
   // SetGlobalStep is a modifier method for current_global_step.
   // It returns an InvalidArgument error if the new_global_step is less than
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index 4e3de33e83a34e0ec6a4c4d87f93127ec134c822..0df05ceb0266fba43dc23162a2d92c33b02c7fa2 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -102,6 +102,7 @@ struct LaunchConv2DBackpropFilterOp<CPUDevice, T> {
                   const Tensor& out_backprop, const Tensor& input,
                   int row_dilation, int col_dilation, int row_stride,
                   int col_stride, const Padding& padding,
+                  const std::vector<int64>& explicit_paddings,
                   Tensor* filter_backprop, TensorFormat data_format) {
     const CPUDevice& d = ctx->eigen_device<CPUDevice>();
     functor::SpatialConvolutionBackwardFilter<CPUDevice, T>()(
@@ -204,6 +205,15 @@ class Conv2DFastBackpropFilterOp : public OpKernel {
                 errors::InvalidArgument(
                     "Row and column strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES(
+        context, padding_ != Padding::EXPLICIT,
+        errors::Unimplemented("Current CPU implementation does not support "
+                              "EXPLICIT padding yet."));
+    std::vector<int64> explicit_paddings;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("explicit_paddings", &explicit_paddings));
+    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings,
+                                              /*num_dims=*/4, data_format_));
     OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
     OP_REQUIRES(context, dilations_.size() == 4,
                 errors::InvalidArgument("Sliding window dilations field must "
@@ -282,7 +292,8 @@ class Conv2DFastBackpropFilterOp : public OpKernel {
     LaunchConv2DBackpropFilterOp<Device, T>()(
         context, false, false, out_backprop, input,
         /*row_dilation=*/1, /*col_dilation=*/1, dims.spatial_dims[0].stride,
-        dims.spatial_dims[1].stride, padding_, filter_backprop, data_format_);
+        dims.spatial_dims[1].stride, padding_, /*explicit_paddings=*/{},
+        filter_backprop, data_format_);
   }
 
  private:
@@ -319,6 +330,15 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
                 errors::InvalidArgument(
                     "Row and column strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES(
+        context, padding_ != Padding::EXPLICIT,
+        errors::Unimplemented("Current CPU implementation does not support "
+                              "EXPLICIT padding yet."));
+    std::vector<int64> explicit_paddings;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("explicit_paddings", &explicit_paddings));
+    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings,
+                                              /*num_dims=*/4, data_format_));
     OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
     OP_REQUIRES(context, dilations_.size() == 4,
                 errors::InvalidArgument("Sliding window dilations field must "
@@ -587,6 +607,10 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
     use_cudnn_ &= CanUseCudnn();
     cudnn_use_autotune_ = CudnnUseAutotune();
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("explicit_paddings", &explicit_paddings_));
+    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
+                                              /*num_dims=*/4, data_format_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -626,13 +650,14 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
 
     launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, input,
               dilation_rows, dilation_cols, stride_rows, stride_cols, padding_,
-              filter_backprop, data_format_);
+              explicit_paddings_, filter_backprop, data_format_);
   }
 
  private:
   std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
+  std::vector<int64> explicit_paddings_;
   bool use_cudnn_;
   TensorFormat data_format_;
   LaunchConv2DBackpropFilterOp<Device, T> launcher_;
@@ -646,7 +671,8 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
     OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
     const Tensor& out_backprop, const Tensor& input, int row_dilation,
     int col_dilation, int row_stride, int col_stride, const Padding& padding,
-    Tensor* filter_backprop, TensorFormat data_format) {
+    const std::vector<int64>& explicit_paddings, Tensor* filter_backprop,
+    TensorFormat data_format) {
   using se::dnn::AlgorithmConfig;
   using se::dnn::AlgorithmDesc;
   using se::dnn::ProfileResult;
@@ -661,35 +687,33 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
   TensorShape filter_shape = filter_backprop->shape();
 
   ConvBackpropDimensions dims;
-  OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensionsV2(
-                          "Conv2DSlowBackpropFilter", /*num_spatial_dims=*/2,
-                          input.shape(), filter_shape, out_backprop.shape(),
-                          dilations, strides, padding, data_format, &dims));
-
-  // TODO(yangzihao): The padding computations should be done in
-  // GetWindowedOutputSize() functions.
-  const int padding_rows =
-      (padding == VALID)
-          ? 0
-          : std::max<int>(0, (dims.spatial_dims[0].output_size - 1) *
-                                     dims.spatial_dims[0].stride +
-                                 (dims.spatial_dims[0].filter_size - 1) *
-                                     dims.spatial_dims[0].dilation +
-                                 1 - dims.spatial_dims[0].input_size);
-  const int padding_cols =
-      (padding == VALID)
-          ? 0
-          : std::max<int>(0, (dims.spatial_dims[1].output_size - 1) *
-                                     dims.spatial_dims[1].stride +
-                                 (dims.spatial_dims[1].filter_size - 1) *
-                                     dims.spatial_dims[1].dilation +
-                                 1 - dims.spatial_dims[1].input_size);
-
-  // TODO(zhengxq): cuDNN only supports equal padding on both sides, so only
-  // calling it when that is true. Remove this check when (if?) cuDNN starts
-  // supporting different padding.
-  bool rows_odd = (padding_rows % 2 != 0);
-  bool cols_odd = (padding_cols % 2 != 0);
+  OP_REQUIRES_OK(
+      ctx, ConvBackpropComputeDimensionsV2(
+               "Conv2DSlowBackpropFilter", /*num_spatial_dims=*/2,
+               input.shape(), filter_shape, out_backprop.shape(), dilations,
+               strides, padding, explicit_paddings, data_format, &dims));
+
+  int64 padding_top = -1, padding_bottom = -1;
+  int64 padding_left = -1, padding_right = -1;
+  if (padding == EXPLICIT) {
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &padding_top,
+                             &padding_bottom);
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &padding_left,
+                             &padding_right);
+  }
+  int64 expected_out_rows, expected_out_cols;
+  // The function is guaranteed to succeed because we checked the output and
+  // padding was valid earlier.
+  TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
+      dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
+      row_dilation, row_stride, padding, &expected_out_rows, &padding_top,
+      &padding_bottom));
+  DCHECK_EQ(dims.spatial_dims[0].output_size, expected_out_rows);
+  TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
+      dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
+      col_dilation, col_stride, padding, &expected_out_cols, &padding_left,
+      &padding_right));
+  DCHECK_EQ(dims.spatial_dims[1].output_size, expected_out_cols);
 
   auto* stream = ctx->op_device_context()->stream();
   OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
@@ -711,7 +735,7 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
       dims.spatial_dims[0].filter_size == 1 &&
       dims.spatial_dims[1].filter_size == 1 && !is_grouped_convolution &&
       dims.spatial_dims[0].stride == 1 && dims.spatial_dims[1].stride == 1 &&
-      data_format == FORMAT_NHWC) {
+      data_format == FORMAT_NHWC && (padding == VALID || padding == SAME)) {
     const uint64 m = dims.in_depth;
     const uint64 k = dims.batch_size * dims.spatial_dims[0].input_size *
                      dims.spatial_dims[1].input_size;
@@ -779,31 +803,43 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
     return;
   }
 
+  const int64 common_padding_rows = std::min(padding_top, padding_bottom);
+  const int64 common_padding_cols = std::min(padding_left, padding_right);
   Tensor compatible_input;
-  if (rows_odd || cols_odd) {
-    // If a padding dimension is odd, we have one more element on the right
-    // side or the bottom side. This is unsupported in cudnn. Therefore,
-    // we pad that extra element and make it compatible.
+  if (padding_top != padding_bottom || padding_left != padding_right) {
+    // Pad the input in the same way we did during the forward pass, so that
+    // cuDNN receives the same input during the backward pass function as it did
+    // during the forward pass function.
+    const int64 padding_rows_diff = std::abs(padding_bottom - padding_top);
+    const int64 padding_cols_diff = std::abs(padding_right - padding_left);
+    const int64 new_in_rows =
+        dims.spatial_dims[0].input_size + padding_rows_diff;
+    const int64 new_in_cols =
+        dims.spatial_dims[1].input_size + padding_cols_diff;
+    const int64 input_pad_top = padding_top - common_padding_rows;
+    const int64 input_pad_bottom = padding_bottom - common_padding_rows;
+    const int64 input_pad_left = padding_left - common_padding_cols;
+    const int64 input_pad_right = padding_right - common_padding_cols;
     OP_REQUIRES_OK(
         ctx, ctx->allocate_temp(
                  DataTypeToEnum<T>::value,
-                 ShapeFromFormat(data_format, dims.batch_size,
-                                 dims.spatial_dims[0].input_size + rows_odd,
-                                 dims.spatial_dims[1].input_size + cols_odd,
-                                 dims.in_depth),
+                 ShapeFromFormat(data_format, dims.batch_size, new_in_rows,
+                                 new_in_cols, dims.in_depth),
                  &compatible_input));
 
     functor::PadInput<GPUDevice, T, int, 4>()(
         ctx->template eigen_device<GPUDevice>(), To32Bit(input.tensor<T, 4>()),
-        {{0, 0}}, {{rows_odd, cols_odd}},
+        {{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
+        {{static_cast<int>(input_pad_bottom),
+          static_cast<int>(input_pad_right)}},
         To32Bit(compatible_input.tensor<T, 4>()), data_format);
   } else {
     compatible_input = input;
   }
 
-  CHECK(padding_rows >= 0 && padding_cols >= 0)
-      << "Negative row or col paddings: (" << padding_rows << ", "
-      << padding_cols << ")";
+  CHECK(common_padding_rows >= 0 && common_padding_cols >= 0)  // Crash OK
+      << "Negative row or col paddings: (" << common_padding_rows << ", "
+      << common_padding_cols << ")";
   se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(dims.batch_size)
       .set_height(GetTensorDim(compatible_input, data_format, 'H'))
@@ -826,8 +862,8 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
       .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
       .set_vertical_filter_stride(dims.spatial_dims[0].stride)
       .set_horizontal_filter_stride(dims.spatial_dims[1].stride)
-      .set_zero_padding_height(padding_rows / 2)
-      .set_zero_padding_width(padding_cols / 2)
+      .set_zero_padding_height(common_padding_rows)
+      .set_zero_padding_width(common_padding_cols)
       .set_group_count(dims.in_depth / filter_shape.dim_size(2));
 
   // NOTE(zhengxq):
@@ -922,8 +958,8 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
         dims.spatial_dims[1].dilation}},   // dilation_cols
       {{dims.spatial_dims[0].stride,       // stride_rows
         dims.spatial_dims[1].stride}},     // stride_cols
-      {{padding_rows,                      // padding_rows
-        padding_cols}},                    // padding_cols
+      {{common_padding_rows,               // padding_rows
+        common_padding_cols}},             // padding_cols
       dtype,                               // tensor datatype
       device_id,                           // device_id
   };
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 9f983ed8166d51a720b4ea0ff360a974a7b4fb86..74b97b98648dc5f2a32d4755ac08d731af5549e8 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -106,8 +106,9 @@ struct LaunchConv2DBackpropInputOp<CPUDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& out_backprop, const Tensor& filter,
                   int row_dilation, int col_dilation, int row_stride,
-                  int col_stride, const Padding& padding, Tensor* in_backprop,
-                  TensorFormat data_format) {
+                  int col_stride, const Padding& padding,
+                  const std::vector<int64>& explicit_paddings,
+                  Tensor* in_backprop, TensorFormat data_format) {
     const CPUDevice& d = ctx->eigen_device<CPUDevice>();
     functor::SpatialConvolutionBackwardInput<CPUDevice, T>()(
         d, in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
@@ -220,6 +221,15 @@ class Conv2DFastBackpropInputOp : public OpKernel {
                 errors::InvalidArgument(
                     "Current Eigen and libxsmm implementations do not "
                     "yet support dilation rates larger than 1."));
+    OP_REQUIRES(
+        context, padding_ != Padding::EXPLICIT,
+        errors::Unimplemented("Current CPU implementation does not support "
+                              "EXPLICIT padding yet."));
+    std::vector<int64> explicit_paddings;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("explicit_paddings", &explicit_paddings));
+    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings,
+                                              /*num_dims=*/4, data_format_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -286,7 +296,8 @@ class Conv2DFastBackpropInputOp : public OpKernel {
     LaunchConv2DBackpropInputOp<Device, T>()(
         context, false, false, out_backprop, filter,
         /*row_dilation=*/1, /*col_dilation=*/1, dims.spatial_dims[0].stride,
-        dims.spatial_dims[1].stride, padding_, in_backprop, data_format_);
+        dims.spatial_dims[1].stride, padding_, /*explicit_paddings=*/{},
+        in_backprop, data_format_);
   }
 
  private:
@@ -336,6 +347,15 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
                 errors::InvalidArgument(
                     "Current libxsmm and customized CPU implementations do "
                     "not yet support dilation rates larger than 1."));
+    OP_REQUIRES(
+        context, padding_ != Padding::EXPLICIT,
+        errors::Unimplemented("Current CPU implementation does not support "
+                              "EXPLICIT padding yet."));
+    std::vector<int64> explicit_paddings;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("explicit_paddings", &explicit_paddings));
+    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings,
+                                              /*num_dims=*/4, data_format_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -661,6 +681,16 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
     use_cudnn_ &= CanUseCudnn();
     cudnn_use_autotune_ = CudnnUseAutotune();
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    if (!std::is_same<Device, GPUDevice>::value) {
+      OP_REQUIRES(
+          context, padding_ != Padding::EXPLICIT,
+          errors::Unimplemented("Current CPU implementation does not support "
+                                "EXPLICIT padding yet."));
+    }
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("explicit_paddings", &explicit_paddings_));
+    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
+                                              /*num_dims=*/4, data_format_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -694,13 +724,14 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
 
     launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, filter,
               dilation_rows, dilation_cols, stride_rows, stride_cols, padding_,
-              in_backprop, data_format_);
+              explicit_paddings_, in_backprop, data_format_);
   }
 
  private:
   std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
+  std::vector<int64> explicit_paddings_;
   bool use_cudnn_;
   TensorFormat data_format_;
   LaunchConv2DBackpropInputOp<Device, T> launcher_;
@@ -714,7 +745,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
     const Tensor& out_backprop, const Tensor& filter, int row_dilation,
     int col_dilation, int row_stride, int col_stride, const Padding& padding,
-    Tensor* in_backprop, TensorFormat data_format) {
+    const std::vector<int64>& explicit_paddings, Tensor* in_backprop,
+    TensorFormat data_format) {
   using se::dnn::AlgorithmConfig;
   using se::dnn::AlgorithmDesc;
   using se::dnn::ProfileResult;
@@ -731,35 +763,33 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
 
   const TensorShape& filter_shape = filter.shape();
   ConvBackpropDimensions dims;
-  OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensionsV2(
-                          "Conv2DSlowBackpropInput", /*num_spatial_dims=*/2,
-                          input_shape, filter_shape, out_backprop.shape(),
-                          dilations, strides, padding, data_format, &dims));
-
-  // TODO(yangzihao): The padding computations should be done in
-  // GetWindowedOutputSize() functions.
-  const int padding_rows =
-      (padding == VALID)
-          ? 0
-          : std::max<int>(0, (dims.spatial_dims[0].output_size - 1) *
-                                     dims.spatial_dims[0].stride +
-                                 (dims.spatial_dims[0].filter_size - 1) *
-                                     dims.spatial_dims[0].dilation +
-                                 1 - dims.spatial_dims[0].input_size);
-  const int padding_cols =
-      (padding == VALID)
-          ? 0
-          : std::max<int>(0, (dims.spatial_dims[1].output_size - 1) *
-                                     dims.spatial_dims[1].stride +
-                                 (dims.spatial_dims[1].filter_size - 1) *
-                                     dims.spatial_dims[1].dilation +
-                                 1 - dims.spatial_dims[1].input_size);
-
-  // TODO(keveman): cuDNN only supports equal padding on both sides, so only
-  // calling it when that is true. Remove this check when (if?) cuDNN starts
-  // supporting different padding.
-  bool rows_odd = (padding_rows % 2 != 0);
-  bool cols_odd = (padding_cols % 2 != 0);
+  OP_REQUIRES_OK(
+      ctx, ConvBackpropComputeDimensionsV2(
+               "Conv2DSlowBackpropInput", /*num_spatial_dims=*/2, input_shape,
+               filter_shape, out_backprop.shape(), dilations, strides, padding,
+               explicit_paddings, data_format, &dims));
+
+  int64 padding_top = -1, padding_bottom = -1;
+  int64 padding_left = -1, padding_right = -1;
+  if (padding == EXPLICIT) {
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &padding_top,
+                             &padding_bottom);
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &padding_left,
+                             &padding_right);
+  }
+  int64 expected_out_rows, expected_out_cols;
+  // The function is guaranteed to succeed because we checked the output and
+  // padding was valid earlier.
+  TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
+      dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
+      row_dilation, row_stride, padding, &expected_out_rows, &padding_top,
+      &padding_bottom));
+  DCHECK_EQ(dims.spatial_dims[0].output_size, expected_out_rows);
+  TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
+      dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
+      col_dilation, col_stride, padding, &expected_out_cols, &padding_left,
+      &padding_right));
+  DCHECK_EQ(dims.spatial_dims[1].output_size, expected_out_cols);
 
   auto* stream = ctx->op_device_context()->stream();
   OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
@@ -779,7 +809,7 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
   if (dims.spatial_dims[0].filter_size == 1 &&
       dims.spatial_dims[1].filter_size == 1 && !is_grouped_convolution &&
       dims.spatial_dims[0].stride == 1 && dims.spatial_dims[1].stride == 1 &&
-      data_format == FORMAT_NHWC) {
+      data_format == FORMAT_NHWC && (padding == VALID || padding == SAME)) {
     // 1x1 filter, so call cublas directly.
     const uint64 m = dims.batch_size * dims.spatial_dims[0].input_size *
                      dims.spatial_dims[1].input_size;
@@ -841,22 +871,28 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     return;
   }
 
+  const int64 common_padding_rows = std::min(padding_top, padding_bottom);
+  const int64 common_padding_cols = std::min(padding_left, padding_right);
   TensorShape compatible_input_shape;
-  if (rows_odd || cols_odd) {
-    // If a padding dimension is odd, we have one more element on the right
-    // side or the bottom side. This is unsupported in cudnn. Therefore,
-    // we pad that extra element and make it compatible.
+  if (padding_top != padding_bottom || padding_left != padding_right) {
+    // Pad the input in the same way we did during the forward pass, so that
+    // cuDNN receives the same input during the backward pass function as it did
+    // during the forward pass function.
+    const int64 padding_rows_diff = std::abs(padding_bottom - padding_top);
+    const int64 padding_cols_diff = std::abs(padding_right - padding_left);
+    const int64 new_in_rows =
+        dims.spatial_dims[0].input_size + padding_rows_diff;
+    const int64 new_in_cols =
+        dims.spatial_dims[1].input_size + padding_cols_diff;
     compatible_input_shape = ShapeFromFormat(
-        data_format, dims.batch_size,
-        dims.spatial_dims[0].input_size + rows_odd,
-        dims.spatial_dims[1].input_size + cols_odd, dims.in_depth);
+        data_format, dims.batch_size, new_in_rows, new_in_cols, dims.in_depth);
   } else {
     compatible_input_shape = input_shape;
   }
 
-  CHECK(padding_rows >= 0 && padding_cols >= 0)
-      << "Negative row or col paddings: (" << padding_rows << ", "
-      << padding_cols << ")";
+  CHECK(common_padding_rows >= 0 && common_padding_cols >= 0)  // Crash OK
+      << "Negative row or col paddings: (" << common_padding_rows << ", "
+      << common_padding_cols << ")";
   se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(dims.batch_size)
       .set_height(GetTensorDim(compatible_input_shape, data_format, 'H'))
@@ -879,8 +915,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
       .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
       .set_vertical_filter_stride(dims.spatial_dims[0].stride)
       .set_horizontal_filter_stride(dims.spatial_dims[1].stride)
-      .set_zero_padding_height(padding_rows / 2)
-      .set_zero_padding_width(padding_cols / 2)
+      .set_zero_padding_height(common_padding_rows)
+      .set_zero_padding_width(common_padding_cols)
       .set_group_count(dims.in_depth / filter_shape.dim_size(2));
 
   // NOTE(keveman):
@@ -971,8 +1007,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
         dims.spatial_dims[1].dilation}},   // dilation_cols
       {{dims.spatial_dims[0].stride,       // stride_rows
         dims.spatial_dims[1].stride}},     // stride_cols
-      {{padding_rows,                      // padding_rows
-        padding_cols}},                    // padding_cols
+      {{common_padding_rows,               // padding_rows
+        common_padding_cols}},             // padding_cols
       dtype,                               // tensor data type
       device_id,                           // device_id
   };
@@ -1041,7 +1077,7 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     return;
   }
 
-  if (rows_odd || cols_odd) {
+  if (padding_top != padding_bottom || padding_left != padding_right) {
     Tensor in_backprop_remove_padding;
     OP_REQUIRES_OK(
         ctx, ctx->allocate_temp(
@@ -1053,12 +1089,18 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
                                  GetTensorDim(input_shape, data_format, 'C')),
                  &in_backprop_remove_padding));
 
-    // Remove the padding for odd rows or cols.
+    // Remove the padding that was added to the input shape above.
+    const int64 input_pad_top = padding_top - common_padding_rows;
+    const int64 input_pad_bottom = padding_bottom - common_padding_rows;
+    const int64 input_pad_left = padding_left - common_padding_cols;
+    const int64 input_pad_right = padding_right - common_padding_cols;
     functor::PadInput<GPUDevice, T, int, 4>()(
         ctx->template eigen_device<GPUDevice>(),
         To32Bit(const_cast<const Tensor&>(pre_transformed_in_backprop)
                     .tensor<T, 4>()),
-        {{0, 0}}, {{-rows_odd, -cols_odd}},
+        {{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
+        {{static_cast<int>(-input_pad_bottom),
+          static_cast<int>(-input_pad_right)}},
         To32Bit(in_backprop_remove_padding.tensor<T, 4>()), FORMAT_NCHW);
 
     pre_transformed_in_backprop = in_backprop_remove_padding;
diff --git a/tensorflow/core/kernels/conv_grad_ops.cc b/tensorflow/core/kernels/conv_grad_ops.cc
index 507720c998d752f7157be5340445693bf8849173..0fd7550830333f749312f5db54d3ffd6ffa22a4a 100644
--- a/tensorflow/core/kernels/conv_grad_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_ops.cc
@@ -52,24 +52,23 @@ int ConvBackpropDimensions::SpatialPadding(const Padding& padding,
                                        1 - input_size(dim)));
 }
 
-// The V2 version computes windowed output size with arbitrary dilation_rate,
-// while the original version only handles the cases where dilation_rates equal
-// to 1.
-Status ConvBackpropExtractAndVerifyDimensionV2(
+namespace {
+
+Status ConvBackpropExtractAndVerifyDimension(
     StringPiece label, const TensorShape& input_shape,
     const TensorShape& filter_shape, const TensorShape& output_shape,
     const gtl::ArraySlice<int32>& dilations, const std::vector<int32>& strides,
-    Padding padding, int spatial_dim, int filter_spatial_dim,
-    ConvBackpropSpatialDimension* dim) {
+    Padding padding, int64 padding_before, int64 padding_after, int spatial_dim,
+    int filter_spatial_dim, ConvBackpropSpatialDimension* dim) {
   dim->input_size = input_shape.dim_size(spatial_dim);
   dim->filter_size = filter_shape.dim_size(filter_spatial_dim);
   dim->output_size = output_shape.dim_size(spatial_dim);
   dim->stride = strides[spatial_dim];
   dim->dilation = dilations[spatial_dim];
-  int64 out_size = 0, pad_size = 0;
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeV2(dim->input_size, dim->filter_size,
-                                             dim->dilation, dim->stride,
-                                             padding, &out_size, &pad_size));
+  int64 out_size = 0;
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerboseV2(
+      dim->input_size, dim->filter_size, dim->dilation, dim->stride, padding,
+      &out_size, &padding_before, &padding_after));
   if (dim->output_size != out_size) {
     return errors::InvalidArgument(
         label, ": Size of out_backprop doesn't match computed: ", "actual = ",
@@ -79,10 +78,13 @@ Status ConvBackpropExtractAndVerifyDimensionV2(
         " stride: ", dim->stride, " dilation: ", dim->dilation);
   }
 
+  // TODO(reedwm): Correctly handle explicit padding here. The rest of the
+  // fields set on 'dim' are only used in XLA. TensorFlow ops do not yet support
+  // explicit padding for XLA.
   int64 effective_filter_size = (dim->filter_size - 1) * dim->dilation + 1;
   dim->expanded_output_size = (dim->output_size - 1) * dim->stride + 1;
   const auto padded_out_size = dim->input_size + effective_filter_size - 1;
-  dim->pad_before = effective_filter_size - 1 - pad_size;
+  dim->pad_before = effective_filter_size - 1 - padding_before;
   dim->pad_after =
       padded_out_size - dim->expanded_output_size - dim->pad_before;
   VLOG(2) << label << ": expanded_out = " << dim->expanded_output_size
@@ -94,22 +96,14 @@ Status ConvBackpropExtractAndVerifyDimensionV2(
   return Status::OK();
 }
 
-Status ConvBackpropExtractAndVerifyDimension(
-    StringPiece label, const TensorShape& input_shape,
-    const TensorShape& filter_shape, const TensorShape& output_shape,
-    const std::vector<int32>& strides, Padding padding, int spatial_dim,
-    int filter_spatial_dim, ConvBackpropSpatialDimension* dim) {
-  static constexpr std::array<int32, 5> one_dilations = {{1, 1, 1, 1, 1}};
-  return ConvBackpropExtractAndVerifyDimensionV2(
-      label, input_shape, filter_shape, output_shape, one_dilations, strides,
-      padding, spatial_dim, filter_spatial_dim, dim);
-}
+}  // namespace
 
 Status ConvBackpropComputeDimensionsV2(
     StringPiece label, int num_spatial_dims, const TensorShape& input_shape,
     const TensorShape& filter_shape, const TensorShape& out_backprop_shape,
     const gtl::ArraySlice<int32>& dilations, const std::vector<int32>& strides,
-    Padding padding, TensorFormat data_format, ConvBackpropDimensions* dims) {
+    Padding padding, const std::vector<int64>& explicit_paddings,
+    TensorFormat data_format, ConvBackpropDimensions* dims) {
   // The + 2 in the following line is for the batch and feature dimensions.
   const int num_dims = num_spatial_dims + 2;
   if (input_shape.dims() != num_dims) {
@@ -152,9 +146,15 @@ Status ConvBackpropComputeDimensionsV2(
   dims->spatial_dims.resize(num_spatial_dims);
   for (int i = 0; i < num_spatial_dims; ++i) {
     int image_dim = GetTensorSpatialDimIndex(num_dims, data_format, i);
-    TF_RETURN_IF_ERROR(ConvBackpropExtractAndVerifyDimensionV2(
+    int64 padding_before = -1, padding_after = -1;
+    if (padding == EXPLICIT) {
+      padding_before = explicit_paddings[2 * image_dim];
+      padding_after = explicit_paddings[2 * image_dim + 1];
+    }
+    TF_RETURN_IF_ERROR(ConvBackpropExtractAndVerifyDimension(
         label, input_shape, filter_shape, out_backprop_shape, dilations,
-        strides, padding, image_dim, i, &dims->spatial_dims[i]));
+        strides, padding, padding_before, padding_after, image_dim, i,
+        &dims->spatial_dims[i]));
   }
   return Status::OK();
 }
@@ -169,7 +169,8 @@ Status ConvBackpropComputeDimensions(StringPiece label, int num_spatial_dims,
   static constexpr std::array<int32, 5> one_dilations = {{1, 1, 1, 1, 1}};
   return ConvBackpropComputeDimensionsV2(
       label, num_spatial_dims, input_shape, filter_shape, out_backprop_shape,
-      one_dilations, strides, padding, data_format, dims);
+      one_dilations, strides, padding, /*explicit_paddings=*/{}, data_format,
+      dims);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_grad_ops.h b/tensorflow/core/kernels/conv_grad_ops.h
index 9551959463bf1f32010b436671ff7eed1daa9d82..c8e8cf28c55e266575738dfe9ef65d588dd0dd2f 100644
--- a/tensorflow/core/kernels/conv_grad_ops.h
+++ b/tensorflow/core/kernels/conv_grad_ops.h
@@ -176,8 +176,9 @@ struct LaunchConv2DBackpropInputOp {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& out_backprop, const Tensor& filter,
                   int row_dilation, int col_dilation, int row_stride,
-                  int col_stride, const Padding& padding, Tensor* in_backprop,
-                  TensorFormat data_format);
+                  int col_stride, const Padding& padding,
+                  const std::vector<int64>& explicit_paddings,
+                  Tensor* in_backprop, TensorFormat data_format);
 };
 
 template <typename Device, typename T>
@@ -186,6 +187,7 @@ struct LaunchConv2DBackpropFilterOp {
                   const Tensor& out_backprop, const Tensor& input,
                   int row_dilation, int col_dilation, int row_stride,
                   int col_stride, const Padding& padding,
+                  const std::vector<int64>& explicit_paddings,
                   Tensor* filter_backprop, TensorFormat data_format);
 };
 
@@ -195,7 +197,8 @@ struct LaunchConv2DBackpropInputOp<Eigen::GpuDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& input, const Tensor& filter, int row_dilation,
                   int col_dilation, int row_stride, int col_stride,
-                  const Padding& padding, Tensor* output,
+                  const Padding& padding,
+                  const std::vector<int64>& explicit_paddings, Tensor* output,
                   TensorFormat data_format);
 };
 
@@ -205,6 +208,7 @@ struct LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T> {
                   const Tensor& out_backprop, const Tensor& input,
                   int row_dilation, int col_dilation, int row_stride,
                   int col_stride, const Padding& padding,
+                  const std::vector<int64>& explicit_paddings,
                   Tensor* filter_backprop, TensorFormat data_format);
 };
 #endif  // GOOGLE_CUDA
@@ -217,6 +221,8 @@ struct ConvBackpropSpatialDimension {
   int64 output_size;
   int64 stride;
   int64 dilation;
+
+  // The following fields are valid only if the padding is not EXPLICIT.
   int64 expanded_output_size;
 
   // Number of padding elements to be added before/after this dimension of
@@ -248,7 +254,7 @@ struct ConvBackpropDimensions {
 
 // Common code between implementations of Conv?DBackpropInput and
 // Conv?DBackpropFilter. Verifies that the dimensions all match, and computes
-// sizes/padding for the spatial dimensions.
+// sizes/padding for the spatial dimensions. Does not support explicit padding.
 Status ConvBackpropComputeDimensions(StringPiece label, int num_spatial_dims,
                                      const TensorShape& input_shape,
                                      const TensorShape& filter_shape,
@@ -257,13 +263,15 @@ Status ConvBackpropComputeDimensions(StringPiece label, int num_spatial_dims,
                                      Padding padding, TensorFormat data_format,
                                      ConvBackpropDimensions* dims);
 
-// The V2 version computes the same outputs with arbitrary dilation rate.
+// The V2 version computes the same outputs with arbitrary dilation rate and
+// supports explicit padding.
 // TODO(b/67112639): Merge V2 versions and the original versions eventually.
 Status ConvBackpropComputeDimensionsV2(
     StringPiece label, int num_spatial_dims, const TensorShape& input_shape,
     const TensorShape& filter_shape, const TensorShape& out_backprop_shape,
     const gtl::ArraySlice<int32>& dilations, const std::vector<int32>& strides,
-    Padding padding, TensorFormat data_format, ConvBackpropDimensions* dims);
+    Padding padding, const std::vector<int64>& explicit_paddings,
+    TensorFormat data_format, ConvBackpropDimensions* dims);
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_KERNELS_CONV_GRAD_OPS_H_
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index 562a9c8aed5850418aa8acecec35a7860ae99921..ca46da6ba38044b50aa6299b82f9b9cacd87bb4c 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -1152,11 +1152,11 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
     }
 
     ConvBackpropDimensions dims;
-    OP_REQUIRES_OK(context,
-                   ConvBackpropComputeDimensionsV2(
-                       "Conv3DBackpropInputOp", /*num_spatial_dims=*/3,
-                       input_shape, filter_shape, out_backprop_shape, dilation_,
-                       stride_, padding_, data_format_, &dims));
+    OP_REQUIRES_OK(context, ConvBackpropComputeDimensionsV2(
+                                "Conv3DBackpropInputOp", /*num_spatial_dims=*/3,
+                                input_shape, filter_shape, out_backprop_shape,
+                                dilation_, stride_, padding_,
+                                /*explicit_paddings=*/{}, data_format_, &dims));
 
     Tensor* in_backprop;
     OP_REQUIRES_OK(context,
@@ -1537,11 +1537,12 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
     }
 
     ConvBackpropDimensions dims;
-    OP_REQUIRES_OK(context,
-                   ConvBackpropComputeDimensionsV2(
-                       "Conv3DBackpropFilterOp", /*num_spatial_dims=*/3,
-                       input_shape, filter_shape, out_backprop_shape, dilation_,
-                       stride_, padding_, data_format_, &dims));
+    OP_REQUIRES_OK(
+        context,
+        ConvBackpropComputeDimensionsV2(
+            "Conv3DBackpropFilterOp", /*num_spatial_dims=*/3, input_shape,
+            filter_shape, out_backprop_shape, dilation_, stride_, padding_,
+            /*explicit_paddings=*/{}, data_format_, &dims));
 
     Tensor* filter_backprop;
     OP_REQUIRES_OK(context,
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index dfba15792dcf5d293d894027b51c56df31a0e520..a8138fd0a737b40c6b7f38760cd2297a753749b4 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -122,7 +122,8 @@ struct LaunchConv2DOp<CPUDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& input, const Tensor& filter, int row_dilation,
                   int col_dilation, int row_stride, int col_stride,
-                  const Padding& padding, Tensor* output,
+                  const Padding& padding,
+                  const std::vector<int64>& explicit_paddings, Tensor* output,
                   TensorFormat data_format) {
     if (data_format != FORMAT_NHWC) {
       ctx->SetStatus(
@@ -130,6 +131,11 @@ struct LaunchConv2DOp<CPUDevice, T> {
                                 "NHWC tensor format for now."));
       return;
     }
+    // TODO(reedwm): Enable explicit padding on the CPU.
+    OP_REQUIRES(
+        ctx, padding != Padding::EXPLICIT,
+        errors::Unimplemented("Generic conv implementation does not support "
+                              "EXPLICIT padding yet."));
     const int64 in_depth = GetTensorDim(input, data_format, 'C');
     OP_REQUIRES(ctx, in_depth == filter.dim_size(2),
                 errors::Unimplemented("Generic conv implementation does not "
@@ -274,6 +280,10 @@ Status InitConv2DParameters(const OpKernelConstruction* context,
   TF_RETURN_IF_ERROR(context->GetAttr("dilations", &params->dilations));
   TF_RETURN_IF_ERROR(context->GetAttr("strides", &params->strides));
   TF_RETURN_IF_ERROR(context->GetAttr("padding", &params->padding));
+  if (context->HasAttr("explicit_paddings")) {
+    TF_RETURN_IF_ERROR(
+        context->GetAttr("explicit_paddings", &params->explicit_paddings));
+  }
   string data_format_string;
   TF_RETURN_IF_ERROR(context->GetAttr("data_format", &data_format_string));
   TF_REQUIRES(FormatFromString(data_format_string, &params->data_format),
@@ -313,6 +323,10 @@ Status InitConv2DParameters(const OpKernelConstruction* context,
       dilation_h > 0 && dilation_w > 0,
       errors::InvalidArgument("Dilated rates should be larger than 0."));
 
+  TF_RETURN_IF_ERROR(CheckValidPadding(params->padding,
+                                       params->explicit_paddings,
+                                       /*num_dims=*/4, data_format));
+
   return Status::OK();
 }
 
@@ -381,14 +395,22 @@ Status ComputeConv2DDimension(const Conv2DParameters& params,
   const int dilation_cols =
       GetTensorDim(params.dilations, params.data_format, 'W');
 
+  int64 pad_rows_before, pad_rows_after, pad_cols_before, pad_cols_after;
+  if (params.padding == Padding::EXPLICIT) {
+    GetExplicitPaddingForDim(params.explicit_paddings, params.data_format, 'H',
+                             &pad_rows_before, &pad_rows_after);
+    GetExplicitPaddingForDim(params.explicit_paddings, params.data_format, 'W',
+                             &pad_cols_before, &pad_cols_after);
+  }
+
   // Compute windowed output sizes for rows and columns.
-  int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeV2(
+  int64 out_rows = 0, out_cols = 0;
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerboseV2(
       input_rows, filter_rows, dilation_rows, stride_rows, params.padding,
-      &out_rows, &pad_rows));
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeV2(
+      &out_rows, &pad_rows_before, &pad_rows_after));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerboseV2(
       input_cols, filter_cols, dilation_cols, stride_cols, params.padding,
-      &out_cols, &pad_cols));
+      &out_cols, &pad_cols_before, &pad_cols_after));
 
   dimensions->batch = batch;
   dimensions->input_rows = input_rows;
@@ -404,8 +426,10 @@ Status ComputeConv2DDimension(const Conv2DParameters& params,
   dimensions->dilation_cols = dilation_cols;
   dimensions->out_rows = out_rows;
   dimensions->out_cols = out_cols;
-  dimensions->pad_rows = pad_rows;
-  dimensions->pad_cols = pad_cols;
+  dimensions->pad_rows_before = pad_rows_before;
+  dimensions->pad_rows_after = pad_rows_after;
+  dimensions->pad_cols_before = pad_cols_before;
+  dimensions->pad_cols_after = pad_cols_after;
 
   return Status::OK();
 }
@@ -463,33 +487,35 @@ class Conv2DOp : public BinaryOp<T> {
     }
 
 #ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
-    if (LaunchXsmmConvOp<Device, T>::Run(
+    if (params_.padding != EXPLICIT &&
+        LaunchXsmmConvOp<Device, T>::Run(
             context, input, filter, dimensions.batch, dimensions.input_rows,
             dimensions.input_cols, dimensions.in_depth, dimensions.filter_rows,
-            dimensions.filter_cols, dimensions.pad_rows, dimensions.pad_cols,
-            dimensions.out_rows, dimensions.out_cols, dimensions.out_depth,
-            dimensions.dilation_rows, dimensions.dilation_cols,
-            dimensions.stride_rows, dimensions.stride_cols, output,
-            params_.data_format)) {
+            dimensions.filter_cols, dimensions.pad_rows_before,
+            dimensions.pad_cols_before, dimensions.out_rows,
+            dimensions.out_cols, dimensions.out_depth, dimensions.dilation_rows,
+            dimensions.dilation_cols, dimensions.stride_rows,
+            dimensions.stride_cols, output, params_.data_format)) {
       return;
     }
 #endif
 
-    if (LaunchDeepConvOp<Device, T>::Run(
+    if (params_.padding != EXPLICIT &&
+        LaunchDeepConvOp<Device, T>::Run(
             context, input, filter, dimensions.batch, dimensions.input_rows,
             dimensions.input_cols, dimensions.in_depth, dimensions.filter_rows,
-            dimensions.filter_cols, dimensions.pad_rows, dimensions.pad_cols,
-            dimensions.out_rows, dimensions.out_cols, dimensions.out_depth,
-            dimensions.dilation_rows, dimensions.dilation_cols,
-            dimensions.stride_rows, dimensions.stride_cols, output,
-            params_.data_format)) {
+            dimensions.filter_cols, dimensions.pad_rows_before,
+            dimensions.pad_cols_before, dimensions.out_rows,
+            dimensions.out_cols, dimensions.out_depth, dimensions.dilation_rows,
+            dimensions.dilation_cols, dimensions.stride_rows,
+            dimensions.stride_cols, output, params_.data_format)) {
       return;
     }
 
     launcher_(context, use_cudnn_, cudnn_use_autotune_, input, filter,
               dimensions.dilation_rows, dimensions.dilation_cols,
               dimensions.stride_rows, dimensions.stride_cols, params_.padding,
-              output, params_.data_format);
+              params_.explicit_paddings, output, params_.data_format);
   }
 
  private:
@@ -551,7 +577,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
     const Tensor& input_param, const Tensor& filter, int row_dilation,
     int col_dilation, int row_stride, int col_stride, const Padding& padding,
-    Tensor* output, TensorFormat data_format) {
+    const std::vector<int64>& explicit_paddings, Tensor* output,
+    TensorFormat data_format) {
   using se::dnn::AlgorithmConfig;
   using se::dnn::AlgorithmDesc;
   using se::dnn::ProfileResult;
@@ -580,7 +607,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
   bool is_grouped_convolution = patch_depths != in_depths;
   if (patch_rows == 1 && patch_cols == 1 && !is_grouped_convolution &&
       row_dilation == 1 && col_dilation == 1 && row_stride == 1 &&
-      col_stride == 1 && data_format == FORMAT_NHWC) {
+      col_stride == 1 && data_format == FORMAT_NHWC &&
+      (padding == VALID || padding == SAME)) {
     // 1x1 filter, so call cublas directly.
     const uint64 m = in_batch * in_rows * in_cols;
     const uint64 k = patch_depths;
@@ -634,49 +662,78 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     return;
   }
 
-  int padding_rows = 0;
-  int padding_cols = 0;
   const int64 out_batch = GetTensorDim(*output, data_format, 'N');
   const int64 out_rows = GetTensorDim(*output, data_format, 'H');
   const int64 out_cols = GetTensorDim(*output, data_format, 'W');
   const int64 out_depths = GetTensorDim(*output, data_format, 'C');
-  if (padding == SAME) {
-    // Total padding on rows and cols is
-    // Pr = (R' - 1) * S + (Kr - 1) * Dr + 1 - R
-    // Pc = (C' - 1) * S + (Kc - 1) * Dc + 1 - C
-    // where (R', C') are output dimensions, (R, C) are input dimensions, S
-    // is stride, (Dr, Dc) are dilations, (Kr, Kc) are filter dimensions.
-    // We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
-    // and Pc - Pc/2 on the bottom.  When Pr or Pc is odd, this means
-    // we pad more on the right and bottom than on the top and left.
-    padding_rows =
-        std::max<int>(0, (out_rows - 1) * row_stride +
-                             (patch_rows - 1) * row_dilation + 1 - in_rows);
-    padding_cols =
-        std::max<int>(0, (out_cols - 1) * col_stride +
-                             (patch_cols - 1) * col_dilation + 1 - in_cols);
-    const bool rows_odd = (padding_rows % 2 != 0);
-    const bool cols_odd = (padding_cols % 2 != 0);
-    if (rows_odd || cols_odd) {
-      Tensor transformed_input;
-      int64 new_in_rows = in_rows + rows_odd;
-      int64 new_in_cols = in_cols + cols_odd;
-      OP_REQUIRES_OK(
-          ctx,
-          ctx->allocate_temp(DataTypeToEnum<T>::value,
-                             ShapeFromFormat(data_format, in_batch, new_in_rows,
-                                             new_in_cols, in_depths),
-                             &transformed_input));
-
-      functor::PadInput<GPUDevice, T, int, 4>()(
-          ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 4>()),
-          {{0, 0}}, {{rows_odd, cols_odd}},
-          To32Bit(transformed_input.tensor<T, 4>()), data_format);
-
-      input = transformed_input;
-      in_rows = new_in_rows;
-      in_cols = new_in_cols;
+  int64 padding_top = -1, padding_bottom = -1;
+  int64 padding_left = -1, padding_right = -1;
+  if (padding == EXPLICIT) {
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &padding_top,
+                             &padding_bottom);
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &padding_left,
+                             &padding_right);
+  }
+  int64 out_rows_check, out_cols_check;
+  Status status = GetWindowedOutputSizeVerboseV2(
+      in_rows, patch_rows, row_dilation, row_stride, padding, &out_rows_check,
+      &padding_top, &padding_bottom);
+  // The status is guaranteed to be OK because we checked the output and padding
+  // was valid earlier.
+  TF_CHECK_OK(status);
+  DCHECK_EQ(out_rows, out_rows_check);
+  status = GetWindowedOutputSizeVerboseV2(in_cols, patch_cols, col_dilation,
+                                          col_stride, padding, &out_cols_check,
+                                          &padding_left, &padding_right);
+  TF_CHECK_OK(status);
+  DCHECK_EQ(out_cols, out_cols_check);
+
+  const int64 common_padding_rows = std::min(padding_top, padding_bottom);
+  const int64 common_padding_cols = std::min(padding_left, padding_right);
+  if (padding_top != padding_bottom || padding_left != padding_right) {
+    // cuDNN only supports padding the same amount on the left and right sides,
+    // and on the top and bottom sides. So we manually create a new padded
+    // input tensor such that we can pass it to cuDNN.
+
+    // TODO(reedwm): In some cases, we can avoid an allocation even if the two
+    // padding sides are different. For example, if the input is 2x2, the filter
+    // is 1x1, the stride is 2, and the padding is (1, 0, 1, 0), the result is
+    // equivalent to as if the padding is (1, 1, 1, 1). Changing the padding in
+    // such a way would allow us to avoid the allocation.
+    Tensor transformed_input;
+    const int64 padding_rows_diff = std::abs(padding_bottom - padding_top);
+    const int64 padding_cols_diff = std::abs(padding_right - padding_left);
+    const int64 new_in_rows = in_rows + padding_rows_diff;
+    const int64 new_in_cols = in_cols + padding_cols_diff;
+    OP_REQUIRES_OK(ctx, ctx->allocate_temp(
+                            DataTypeToEnum<T>::value,
+                            ShapeFromFormat(data_format, in_batch, new_in_rows,
+                                            new_in_cols, in_depths),
+                            &transformed_input));
+
+    const int64 input_pad_top = padding_top - common_padding_rows;
+    const int64 input_pad_bottom = padding_bottom - common_padding_rows;
+    const int64 input_pad_left = padding_left - common_padding_cols;
+    const int64 input_pad_right = padding_right - common_padding_cols;
+    bool in_bounds =
+        FastBoundsCheck(input_pad_top, std::numeric_limits<int>::max()) &&
+        FastBoundsCheck(input_pad_bottom, std::numeric_limits<int>::max()) &&
+        FastBoundsCheck(input_pad_left, std::numeric_limits<int>::max()) &&
+        FastBoundsCheck(input_pad_right, std::numeric_limits<int>::max());
+    if (!in_bounds) {
+      ctx->SetStatus(errors::InvalidArgument("Padding is too large."));
+      return;
     }
+    functor::PadInput<GPUDevice, T, int, 4>()(
+        ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 4>()),
+        {{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
+        {{static_cast<int>(input_pad_bottom),
+          static_cast<int>(input_pad_right)}},
+        To32Bit(transformed_input.tensor<T, 4>()), data_format);
+
+    input = transformed_input;
+    in_rows = new_in_rows;
+    in_cols = new_in_cols;
   }
 
   if (data_format == FORMAT_NHWC) {
@@ -698,9 +755,9 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     }
   }
 
-  CHECK(padding_rows >= 0 && padding_cols >= 0)
-      << "Negative row or col paddings: (" << padding_rows << ", "
-      << padding_cols << ")";
+  CHECK(common_padding_rows >= 0 && common_padding_cols >= 0)  // Crash OK
+      << "Negative row or col paddings: (" << common_padding_rows << ", "
+      << common_padding_cols << ")";
   se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(in_batch)
       .set_feature_map_count(in_depths)
@@ -723,8 +780,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
       .set_horizontal_dilation_rate(col_dilation)
       .set_vertical_filter_stride(row_stride)
       .set_horizontal_filter_stride(col_stride)
-      .set_zero_padding_height(padding_rows / 2)
-      .set_zero_padding_width(padding_cols / 2)
+      .set_zero_padding_height(common_padding_rows)
+      .set_zero_padding_width(common_padding_cols)
       .set_group_count(in_depths / patch_depths);
 
   Tensor transformed_filter;
@@ -767,23 +824,23 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
   int device_id = stream->parent()->device_ordinal();
   DataType dtype = input.dtype();
   ConvParameters conv_parameters = {
-      in_batch,          // batch
-      in_depths,         // in_depths
-      {{in_rows,         // in_rows
-        in_cols}},       // in_cols
-      FORMAT_NCHW,       // compute_data_format
-      out_depths,        // out_depths
-      {{patch_rows,      // filter_rows
-        patch_cols,      // filter_cols
-        patch_depths}},  // filter_depths
-      {{row_dilation,    // dilation_rows
-        col_dilation}},  // dilation_cols
-      {{row_stride,      // stride_rows
-        col_stride}},    // stride_cols
-      {{padding_rows,    // padding_rows
-        padding_cols}},  // padding_cols
-      dtype,             // tensor datatype
-      device_id,         // device_id
+      in_batch,                 // batch
+      in_depths,                // in_depths
+      {{in_rows,                // in_rows
+        in_cols}},              // in_cols
+      FORMAT_NCHW,              // compute_data_format
+      out_depths,               // out_depths
+      {{patch_rows,             // filter_rows
+        patch_cols,             // filter_cols
+        patch_depths}},         // filter_depths
+      {{row_dilation,           // dilation_rows
+        col_dilation}},         // dilation_cols
+      {{row_stride,             // stride_rows
+        col_stride}},           // stride_cols
+      {{common_padding_rows,    // padding_rows
+        common_padding_cols}},  // padding_cols
+      dtype,                    // tensor datatype
+      device_id,                // device_id
   };
   AlgorithmConfig algorithm_config;
   if (cudnn_use_autotune &&
diff --git a/tensorflow/core/kernels/conv_ops.h b/tensorflow/core/kernels/conv_ops.h
index 7ec878e0b2fc6eaae2a89610a9f8491689705f0c..105a4b1b825e304175d62c1723aeb46154b46a96 100644
--- a/tensorflow/core/kernels/conv_ops.h
+++ b/tensorflow/core/kernels/conv_ops.h
@@ -36,7 +36,8 @@ struct LaunchConv2DOp {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& input, const Tensor& filter, int row_dilation,
                   int col_dilation, int row_stride, int col_stride,
-                  const Padding& padding, Tensor* output,
+                  const Padding& padding,
+                  const std::vector<int64>& explicit_paddings, Tensor* output,
                   TensorFormat data_format);
 };
 
@@ -46,7 +47,8 @@ struct LaunchConv2DOp<Eigen::GpuDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& input, const Tensor& filter, int row_dilation,
                   int col_dilation, int row_stride, int col_stride,
-                  const Padding& padding, Tensor* output,
+                  const Padding& padding,
+                  const std::vector<int64>& explicit_paddings, Tensor* output,
                   TensorFormat data_format);
 };
 #endif  // GOOGLE_CUDA
@@ -63,7 +65,7 @@ struct Im2ColBufferResource : public ResourceBase {
   // the buffer memory held by this resource.
   mutex mu;
   T* data;
-  string DebugString() { return "Im2ColBufferResource"; }
+  string DebugString() const { return "Im2ColBufferResource"; }
 };
 
 // Convolution parameters specified by Op attributes.
@@ -72,6 +74,7 @@ struct Conv2DParameters {
   std::vector<int32> strides;
   Padding padding;
   TensorFormat data_format;
+  std::vector<int64> explicit_paddings;
 };
 
 // Convolution dimensions inferred from parameters, input and filter tensors.
@@ -94,8 +97,10 @@ struct Conv2DDimensions {
 
   int64 out_rows;
   int64 out_cols;
-  int64 pad_rows;
-  int64 pad_cols;
+  int64 pad_rows_before;
+  int64 pad_rows_after;
+  int64 pad_cols_before;
+  int64 pad_cols_after;
 };
 
 // Initializes and validates Conv2D parameters configured by OpKernel
diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc
index ae4132bb0acef649eb1c3ee1abd443c288e61370..fc93915e165169cb048b75008e4f4449d5161e35 100644
--- a/tensorflow/core/kernels/conv_ops_test.cc
+++ b/tensorflow/core/kernels/conv_ops_test.cc
@@ -770,7 +770,15 @@ class FusedConv2DOpTest : public OpsTestBase {
     ASSERT_EQ(conv_2d.dtype(), fused_conv_2d.dtype());
     ASSERT_EQ(conv_2d.shape(), fused_conv_2d.shape());
 
-    test::ExpectClose(conv_2d, fused_conv_2d, /*atol=*/1e-6);
+    // NOTE(intel-tf): When filter_size is equal to the input image size,
+    // conv2d essentially is element-wise multiplication followed by
+    // a full sum reduction, which causes larger numerical error
+    // than usual cases.
+    if (image_width == filter_size && image_height == filter_size) {
+      test::ExpectClose(conv_2d, fused_conv_2d, /*atol=*/1e-4);
+    } else {
+      test::ExpectClose(conv_2d, fused_conv_2d, /*atol=*/1e-6);
+    }
   }
 
   void VerifyFusedBatchNormTensorsNear(int depth, int image_width,
@@ -812,7 +820,15 @@ class FusedConv2DOpTest : public OpsTestBase {
     ASSERT_EQ(conv_2d.dtype(), fused_conv_2d.dtype());
     ASSERT_EQ(conv_2d.shape(), fused_conv_2d.shape());
 
-    test::ExpectClose(conv_2d, fused_conv_2d, /*atol=*/1e-6);
+    // NOTE(intel-tf): When filter_size is equal to the input image size,
+    // conv2d essentially is element-wise multiplication followed by
+    // a full sum reduction, which causes larger numerical error
+    // than usual cases.
+    if (image_width == filter_size && image_height == filter_size) {
+      test::ExpectClose(conv_2d, fused_conv_2d, /*atol=*/1e-4);
+    } else {
+      test::ExpectClose(conv_2d, fused_conv_2d, /*atol=*/1e-6);
+    }
   }
 
   // Verifies that computing Conv2D+BiasAdd in a graph is identical to
@@ -1481,6 +1497,26 @@ BM_FusedConv2DWithBatchNormAndRelu(32, 32, 32, 128, 3, 3, 1024, cpu,
                                    "3x3 /b 32");
 
 #if GOOGLE_CUDA
+// -------------------------------------------------------------------------- //
+// 1x1 Convolution
+// -------------------------------------------------------------------------- //
+
+BM_Conv2D(8, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 8");
+BM_Conv2D(16, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 16");
+BM_Conv2D(32, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 32");
+
+BM_Conv2DWithBiasAndRelu(8, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 8");
+BM_Conv2DWithBiasAndRelu(16, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 16");
+BM_Conv2DWithBiasAndRelu(32, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 32");
+
+BM_FusedConv2DWithBiasAndRelu(8, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 8");
+BM_FusedConv2DWithBiasAndRelu(16, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 16");
+BM_FusedConv2DWithBiasAndRelu(32, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 32");
+
+// -------------------------------------------------------------------------- //
+// 3x3 Convolution
+// -------------------------------------------------------------------------- //
+
 BM_Conv2D(8, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 8");
 BM_Conv2D(16, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 16");
 BM_Conv2D(32, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 32");
diff --git a/tensorflow/core/kernels/cuda_solvers.cc b/tensorflow/core/kernels/cuda_solvers.cc
index a59baaa96fc73cc442287dfb4550bc2f6932956b..39d0a998fdcfe0710af97e404e142955e57a7c2b 100644
--- a/tensorflow/core/kernels/cuda_solvers.cc
+++ b/tensorflow/core/kernels/cuda_solvers.cc
@@ -692,8 +692,8 @@ static inline Status GetrsBatchedImpl(
     SolverFnT solver, CudaSolver* cuda_solver, OpKernelContext* context,
     cublasHandle_t cublas_handle, cublasOperation_t trans, int n, int nrhs,
     const Scalar* const host_a_dev_ptrs[], int lda, const int* dev_pivots,
-    const Scalar* const host_b_dev_ptrs[], int ldb,
-    DeviceLapackInfo* dev_lapack_info, int batch_size) {
+    const Scalar* const host_b_dev_ptrs[], int ldb, int* host_lapack_info,
+    int batch_size) {
   mutex_lock lock(handle_map_mutex);
   using CudaScalar = typename CUDAComplexT<Scalar>::type;
   ScratchSpace<uint8> dev_a_dev_ptrs =
@@ -714,7 +714,7 @@ static inline Status GetrsBatchedImpl(
       cublas_handle, trans, n, nrhs,
       reinterpret_cast<const CudaScalar* const*>(dev_a_dev_ptrs.data()), lda,
       dev_pivots, reinterpret_cast<CudaScalar**>(dev_b_dev_ptrs.mutable_data()),
-      ldb, dev_lapack_info->mutable_data(), batch_size));
+      ldb, host_lapack_info, batch_size));
   return Status::OK();
 }
 
@@ -723,13 +723,13 @@ static inline Status GetrsBatchedImpl(
   Status CudaSolver::GetrsBatched(                                             \
       cublasOperation_t trans, int n, int nrhs,                                \
       const Scalar* const host_a_dev_ptrs[], int lda, const int* dev_pivots,   \
-      const Scalar* const host_b_dev_ptrs[], int ldb,                          \
-      DeviceLapackInfo* dev_lapack_info, int batch_size) {                     \
+      const Scalar* const host_b_dev_ptrs[], int ldb, int* host_lapack_info,   \
+      int batch_size) {                                                        \
     return GetrsBatchedImpl(reinterpret_cast<getrs_##type_prefix*>(            \
                                 BLAS_SOLVER_FN(getrsBatched, type_prefix)),    \
                             this, context_, cublas_handle_, trans, n, nrhs,    \
                             host_a_dev_ptrs, lda, dev_pivots, host_b_dev_ptrs, \
-                            ldb, dev_lapack_info, batch_size);                 \
+                            ldb, host_lapack_info, batch_size);                \
   }
 
 TF_CALL_LAPACK_TYPES(GETRS_BATCHED_INSTANCE);
diff --git a/tensorflow/core/kernels/cuda_solvers.h b/tensorflow/core/kernels/cuda_solvers.h
index 2c30d036df71f917f7e302141f577a49ed4c5112..1fc344731c28df2e2d4cb9e931accfc0ca4592ed 100644
--- a/tensorflow/core/kernels/cuda_solvers.h
+++ b/tensorflow/core/kernels/cuda_solvers.h
@@ -235,13 +235,14 @@ class CudaSolver {
                       int batch_size) TF_MUST_USE_RESULT;
 
   // Batched linear solver using LU factorization from getrfBatched.
-  // See:
+  // Notice that lapack_info is returned on the host, as opposed to
+  // most of the other functions that return it on the device. See:
   // http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getrsbatched
   template <typename Scalar>
   Status GetrsBatched(cublasOperation_t trans, int n, int nrhs,
                       const Scalar* const dev_Aarray[], int lda,
                       const int* devIpiv, const Scalar* const dev_Barray[],
-                      int ldb, DeviceLapackInfo* dev_lapack_info,
+                      int ldb, int* host_lapack_info,
                       int batch_size) TF_MUST_USE_RESULT;
 
   // Computes matrix inverses for a batch of small matrices. Uses the outputs
diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc
index d37f5fb9daea21737bb787521385d3090125b6bf..196494cbcf8b7f4f670599241d5bdbb1c29c7cd1 100644
--- a/tensorflow/core/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc
@@ -743,7 +743,7 @@ Status DoBackward(
     /* forward inputs */
     const Tensor* input, const Tensor* input_h, const Tensor* input_c,
     const Tensor* params,
-    /* forward outptus */
+    /* forward outputs */
     const Tensor* output, const Tensor* output_h, const Tensor* output_c,
     /* backprop inputs */
     const Tensor* output_backprop, const Tensor* output_h_backprop,
diff --git a/tensorflow/core/kernels/data/cache_dataset_ops.cc b/tensorflow/core/kernels/data/cache_dataset_ops.cc
index f00b38e732a7835896a275d14507e75eade05fa1..535f49cff804b6d210b8b4cbf757a458cab2d342 100644
--- a/tensorflow/core/kernels/data/cache_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/cache_dataset_ops.cc
@@ -614,7 +614,9 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
      public:
       MemoryCache() = default;
 
-      string DebugString() override { return "CacheDataset::MemoryCache"; }
+      string DebugString() const override {
+        return "CacheDataset::MemoryCache";
+      }
 
       // Marks the cache as completed.
       void Complete() {
diff --git a/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc b/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
index a07eaebdf9d645fba51945d7bd3e79b72b5e5dc2..83eeed7892b2bfd75d508b99feaab70a987fc06a 100644
--- a/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
@@ -106,7 +106,7 @@ class MaterializedDatasetResource : public ResourceBase {
       const std::vector<PartialTensorShape>& output_shapes)
       : output_dtypes_(output_dtypes), output_shapes_(output_shapes) {}
 
-  string DebugString() override {
+  string DebugString() const override {
     return "Materialized IndexedDataset resource";
   }
 
diff --git a/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
index ef75c844565874aa32369f3325be5da1075e7323..a6348e464701482c6ed077728c5f9ca345e8095a 100644
--- a/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
@@ -259,7 +259,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
                                             params.dataset->batch_size_)) {
         std::vector<string> components =
             str_util::Split(params.prefix, "::", str_util::SkipEmpty());
-        prefix_end_ = components.back();
+        key_prefix_ = components.back();
       }
 
       ~Iterator() override {
@@ -397,8 +397,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         const auto& stats_aggregator = ctx->stats_aggregator();
         if (stats_aggregator) {
           stats_aggregator->AddScalar(
-              strings::StrCat(prefix_end_, "::active_parallel_calls"),
-              static_cast<float>(num_calls_));
+              strings::StrCat(key_prefix_, "::thread_utilization"),
+              static_cast<float>(num_calls_) /
+                  static_cast<float>(num_parallel_calls_->value));
         }
         cond_var_->notify_all();
       }
@@ -637,18 +638,13 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
               num_calls_++;
             }
           }
-          const std::shared_ptr<StatsAggregator>& stats_aggregator =
-              ctx->stats_aggregator();
+          const auto& stats_aggregator = ctx->stats_aggregator();
           if (stats_aggregator) {
             mutex_lock l(*mu_);
-            // TODO(shivaniagrawal): add `parallel_calls_utilization` in the
-            // monitoring code or as histogram at fixed time intervals.
             stats_aggregator->AddScalar(
-                strings::StrCat(prefix_end_, "::active_parallel_calls"),
-                static_cast<float>(num_calls_));
-            stats_aggregator->AddScalar(
-                strings::StrCat(prefix_end_, "::num_parallel_calls"),
-                static_cast<float>(num_parallel_calls_->value));
+                strings::StrCat(key_prefix_, "::thread_utilization"),
+                static_cast<float>(num_calls_) /
+                    static_cast<float>(num_parallel_calls_->value));
           }
           for (const auto& call : new_calls) {
             CallFunction(ctx, call.first, call.second);
@@ -803,7 +799,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       int64 waiting_ GUARDED_BY(*mu_) = 0;
       // Identifies the maximum number of batch results to store.
       int64 max_batch_results_ GUARDED_BY(*mu_);
-      string prefix_end_;
+      string key_prefix_;
       std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
diff --git a/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc b/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
index 76ab33fe98887dafd69a45e80ee6794d7044384b..87263b5606e9f080cb8fdf8189fe0fcf9cbd38c2 100644
--- a/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
@@ -186,6 +186,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
 
         Status s = instantiated_captured_func_->Run(ctx, std::move(args),
                                                     &state_and_output);
+        DCHECK(state_and_output.size() <=
+               dataset()->state_types_.size() + output_dtypes().size());
         if (s.ok()) {
           state_.clear();
           size_t i = 0;
diff --git a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
index 8ae45ed5c9d9fe199ef392a1430f359172ec5c73..fab3cab7da51e3d77ef88faf9bbf9c123c601885 100644
--- a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
@@ -51,7 +51,7 @@ class ThreadPoolResource : public ResourceBase {
 
   int32 NumThreads() { return thread_pool_.NumThreads(); }
 
-  string DebugString() override { return "ThreadPoolResource"; }
+  string DebugString() const override { return "ThreadPoolResource"; }
 
  private:
   thread::ThreadPool thread_pool_;
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 9f5881563b5db2b6b5a678b777789091756a6e7a..81e26d35c06ebbfcc329d25c51d8a02543f64350 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -231,7 +231,7 @@ class IteratorResource : public ResourceBase {
     return Status::OK();
   }
 
-  string DebugString() override { return "Iterator resource"; }
+  string DebugString() const override { return "Iterator resource"; }
 
   const DataTypeVector& output_dtypes() const { return output_dtypes_; }
 
diff --git a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
index ba2125a66eb98985ebd0ae8f55bfc239997ad6df..05528e0ee09c58362928ade514bd357385fb9753 100644
--- a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
+++ b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
@@ -59,7 +59,7 @@ class MultiDeviceIterator : public ResourceBase {
     DCHECK(lib_ != nullptr);
   }
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("MultiDeviceIterator for ", devices_.size(),
                            " devices");
   }
diff --git a/tensorflow/core/kernels/data/optimize_dataset_op.cc b/tensorflow/core/kernels/data/optimize_dataset_op.cc
index 9c50d8050a82397f1578ab3f577ef5ad77f81767..04cc48a0be588a98bf9cd214a1a8741ad48ccf29 100644
--- a/tensorflow/core/kernels/data/optimize_dataset_op.cc
+++ b/tensorflow/core/kernels/data/optimize_dataset_op.cc
@@ -40,6 +40,8 @@ namespace tensorflow {
 namespace data {
 namespace {
 
+static const char* const kOptimizerName = "tf_data_meta_optimizer";
+
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 class OptimizeDatasetOp : public UnaryDatasetOpKernel {
@@ -286,31 +288,6 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       (*meta_graph_def.mutable_collection_def())["train_op"] = collection_def;
 
       // Create Grappler item.
-      tensorflow::ConfigProto config;
-      RewriterConfig& rewriter_config =
-          *config.mutable_graph_options()->mutable_rewrite_options();
-      for (const string& optimization : optimizations_) {
-        rewriter_config.add_optimizers(optimization);
-      }
-      // If no optimizations were specified, supply a non-existent
-      // optimization to prevent Grappler from applying the default set of
-      // optimizations as some of them do not work out of the box at the
-      // moment (e.g. because we have no cost model for dataset ops).
-      if (optimizations_.empty()) {
-        rewriter_config.add_optimizers("non-existent");
-      } else {
-        // If we apply custom dataset optimizers, explicitly trigger a subset of
-        // standard grappler optimizations to further optimize modified dataset
-        // graphs (e.g. performing constant folding on merged functions,
-        // removing unused graph nodes)
-        // TODO(b/118175421): This should be part of the tf.data optimization
-        // pass manager.
-        // TODO(b/120437209): Apply `constfold` optimization when it is fixed.
-        for (const auto& optimizer :
-             {"pruning", "function", "shape", "arithmetic", "dependency"}) {
-          rewriter_config.add_optimizers(optimizer);
-        }
-      }
       tensorflow::grappler::ItemConfig item_config;
       item_config.apply_optimizations = true;
       std::unique_ptr<tensorflow::grappler::GrapplerItem> grappler_item =
@@ -319,13 +296,21 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       std::unordered_map<string, tensorflow::DeviceProperties> device_map;
       tensorflow::grappler::VirtualCluster cluster(device_map);
 
-      // Run optimizer.
-      if (VLOG_IS_ON(2)) {
-        LOG(INFO) << "Performing the following optimizations:";
-        for (const string& optimization : optimizations_) {
-          LOG(INFO) << "  " << optimization;
-        }
+      // Run data optimizer using grappler's meta optimizer.
+      tensorflow::ConfigProto config;
+      RewriterConfig& rewriter_config =
+          *config.mutable_graph_options()->mutable_rewrite_options();
+      rewriter_config.add_optimizers(kOptimizerName);
+
+      auto custom_optimizer = rewriter_config.add_custom_optimizers();
+      custom_optimizer->set_name(kOptimizerName);
+      auto* custom_optimizations_list =
+          (*custom_optimizer->mutable_parameter_map())["optimizers"]
+              .mutable_list();
+      for (const auto& opt : optimizations_) {
+        custom_optimizations_list->add_s(opt);
       }
+
       TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
           *grappler_item, config, ctx->device(), &cluster, graph_def));
 
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index 74791669ce99c065f0e10304fd8c09de4b1d92f8..dc2663d1e0c701412573902fc4ef6c91604bf15b 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include <atomic>
 #include <deque>
+#include <memory>
 #include <utility>
 
 #include "tensorflow/core/common_runtime/function.h"
@@ -21,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/core/threadpool.h"
@@ -43,12 +45,7 @@ namespace {
 //
 // Furthermore, this class favors modularity over extended functionality. In
 // particular, it refrains from implementing configurable buffering of output
-// elements and prefetching of input iterators, relying on other parts of
-// tf.data to provide this functionality if necessary.
-//
-// The above design choices were made with automated optimizations in mind,
-// isolating the degree of parallelism as the single tunable knob of this
-// implementation.
+// elements and prefetching of input iterators.
 class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit ParallelInterleaveDatasetOp(OpKernelConstruction* ctx)
@@ -209,7 +206,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
                 false /* low_latency_hint */)) {
         std::vector<string> components =
             str_util::Split(params.prefix, "::", str_util::SkipEmpty());
-        prefix_end_ = components.back();
+        key_prefix_ = components.back();
       }
 
       ~ParallelInterleaveIterator() override {
@@ -237,27 +234,20 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
-        std::shared_ptr<InvocationResult> result;
-        do {
-          result.reset();
-          {
-            mutex_lock l(*mu_);
-            EnsureRunnerThreadStarted(ctx);
-            while (ShouldWait(&result)) {
-              RecordStop(ctx);
-              cond_var_->wait(l);
-              RecordStart(ctx);
-            }
-            if (!result) {
-              *end_of_sequence = true;
-              return Status::OK();
-            }
+        std::shared_ptr<Result> result;
+        {
+          mutex_lock l(*mu_);
+          EnsureThreadsStarted(ctx);
+          while (!Consume(&result)) {
+            RecordStop(ctx);
+            cond_var_->wait(l);
+            RecordStart(ctx);
           }
-          RecordStop(ctx);
-          result->notification.WaitForNotification();
-          RecordStart(ctx);
-        } while (result->skip);
-
+        }
+        if (!result) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
         if (result->status.ok()) {
           *out_tensors = std::move(result->return_values);
           RecordBufferDequeue(ctx, *out_tensors);
@@ -281,37 +271,22 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
         while (num_calls_ > 0) {
           cond_var_->wait(l);
         }
-        CHECK_EQ(num_calls_, 0);
+        DCHECK_EQ(num_calls_, 0);
         TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
-        TF_RETURN_IF_ERROR(writer->WriteScalar(
-            full_name("invocation_results.size"), invocation_results_.size()));
-        for (size_t i = 0; i < invocation_results_.size(); i++) {
-          std::shared_ptr<InvocationResult> result = invocation_results_[i];
-          TF_RETURN_IF_ERROR(WriteStatusLocked(writer, i, result->status));
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat("invocation_results[", i, "].size")),
-              result->return_values.size()));
-          for (size_t j = 0; j < result->return_values.size(); j++) {
-            TF_RETURN_IF_ERROR(writer->WriteTensor(
-                full_name(
-                    strings::StrCat("invocation_results[", i, "][", j, "]")),
-                result->return_values[j]));
-          }
-          if (result->skip) {
-            TF_RETURN_IF_ERROR(writer->WriteScalar(
-                full_name(strings::StrCat("invocation_results[", i, "].skip")),
-                ""));
-          }
-        }
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("block_index"), block_index_));
         TF_RETURN_IF_ERROR(
             writer->WriteScalar(full_name("cycle_index"), cycle_index_));
         if (end_of_input_) {
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("end_of_input"), ""));
         }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("element_id_counter"),
+                                               element_id_counter_));
         TF_RETURN_IF_ERROR(
             writer->WriteScalar(full_name("num_open"), num_open_));
         TF_RETURN_IF_ERROR(WriteCurrentElements(writer));
+        TF_RETURN_IF_ERROR(WriteFutureElements(writer));
         return Status::OK();
       }
 
@@ -319,265 +294,391 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
                              IteratorStateReader* reader) override {
         mutex_lock l(*mu_);
         TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
-        int64 invocation_results_size;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            full_name("invocation_results.size"), &invocation_results_size));
-        for (size_t i = 0; i < invocation_results_size; i++) {
-          std::shared_ptr<InvocationResult> result(new InvocationResult());
-          invocation_results_.push_back(result);
-          TF_RETURN_IF_ERROR(ReadStatusLocked(reader, i, &result->status));
-          size_t num_return_values;
-          {
-            int64 size;
-            TF_RETURN_IF_ERROR(reader->ReadScalar(
-                full_name(strings::StrCat("invocation_results[", i, "].size")),
-                &size));
-            num_return_values = static_cast<size_t>(size);
-            if (num_return_values != size) {
-              return errors::InvalidArgument(strings::StrCat(
-                  full_name(
-                      strings::StrCat("invocation_results[", i, "].size")),
-                  ": ", size, " is not a valid value of type size_t."));
-            }
-          }
-          result->return_values.reserve(num_return_values);
-          for (size_t j = 0; j < num_return_values; j++) {
-            result->return_values.emplace_back();
-            TF_RETURN_IF_ERROR(
-                reader->ReadTensor(full_name(strings::StrCat(
-                                       "invocation_results[", i, "][", j, "]")),
-                                   &result->return_values.back()));
-          }
-          result->skip = reader->Contains(
-              full_name(strings::StrCat("invocation_results[", i, "].skip")));
-          result->notification.Notify();
-        }
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("block_index"), &block_index_));
         TF_RETURN_IF_ERROR(
             reader->ReadScalar(full_name("cycle_index"), &cycle_index_));
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("element_id_counter"),
+                                              &element_id_counter_));
         if (reader->Contains(full_name("end_of_input"))) end_of_input_ = true;
         TF_RETURN_IF_ERROR(
             reader->ReadScalar(full_name("num_open"), &num_open_));
         TF_RETURN_IF_ERROR(ReadCurrentElements(ctx, reader));
+        TF_RETURN_IF_ERROR(ReadFutureElements(ctx, reader));
         return Status::OK();
       }
 
      private:
+      // Represents the result of fetching an element from a dataset.
+      struct Result {
+        Status status;
+        std::vector<Tensor> return_values;
+        // Indicates whether the result is ready to be consumed.
+        bool is_ready = false;
+      };
+
+      // The interleave transformation repeatedly inputs elements, applies the
+      // user-provided function to transform the input elements to datasets, and
+      // interleaves the elements of these datasets as its output.
+      //
+      // This structure represents an input element and derived state.
       struct Element {
+        // Unique identifier, needed to support checkpointing.
+        int64 id;
+        // The actual input element.
+        std::vector<Tensor> inputs;
+        // Iterator created from the input element.
         std::unique_ptr<IteratorBase> iterator;
-        std::vector<Tensor> inputs;  // inputs for creating the iterator
-        bool in_use;
+        mutex mu;
+        // Buffer for storing the outputs of `iterator`.
+        std::deque<std::shared_ptr<Result>> results GUARDED_BY(mu);
+        // Indicates whether the element is used by a worker thread.
+        bool in_use = false;
       };
 
-      struct InvocationResult {
-        Notification notification;  // used for coordination with the consumer
-        Status status;              // the invocation status
-        std::vector<Tensor> return_values;  // the invocation result values
-        bool skip;  // if set the result should be skipped
-      };
+      // Advances the position in the interleave cycle to the next cycle
+      // element.
+      void AdvanceToNextInCycle() EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        block_index_ = 0;
+        cycle_index_ = (cycle_index_ + 1) % dataset()->cycle_length_;
+      }
 
-      void EnsureRunnerThreadStarted(IteratorContext* ctx)
+      // Advances the position in the interleave cycle by one.
+      void AdvancePosition() EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        ++block_index_;
+        if (block_index_ == dataset()->block_length_) {
+          AdvanceToNextInCycle();
+        }
+      }
+
+      // Consumes a result (if available), returning an indication of whether
+      // a result is available. If `true` is returned, `result` either
+      // points to a valid result or is null if end of input has been reached.
+      bool Consume(std::shared_ptr<Result>* result)
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
-        if (!runner_thread_) {
-          std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-          runner_thread_.reset(ctx->env()->StartThread(
-              {}, "tf_data_parallel_interleave_runner",
-              [this, new_ctx]() { RunnerThread(new_ctx); }));
+        if (!sloppy_) {
+          return ConsumeHelper(result);
         }
+        // If we are allowed to be sloppy (i.e. return results out of order),
+        // try to find an element in the cycle that has a result available.
+        for (int i = 0; i < dataset()->cycle_length_; ++i) {
+          if (ConsumeHelper(result)) {
+            return true;
+          }
+          AdvanceToNextInCycle();
+        }
+        return false;
       }
 
-      // Fetches up to `results.size()` outputs from the cycle element at
-      // position `cycle_index`.
+      bool ConsumeHelper(std::shared_ptr<Result>* result)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        while (true) {
+          std::shared_ptr<Element> element = current_elements_[cycle_index_];
+          if (element) {
+            mutex_lock l(element->mu);
+            if (!element->results.empty()) {
+              if (element->results.front()->is_ready) {
+                // We found a result.
+                std::swap(*result, element->results.front());
+                element->results.pop_front();
+                AdvancePosition();
+                cond_var_->notify_all();
+                return true;
+              } else {
+                // Wait for the result to become ready.
+                return false;
+              }
+            } else if (!element->iterator) {
+              // We reached the end of input for this element. Reset
+              // it and move on to the next cycle element.
+              current_elements_[cycle_index_].reset();
+              AdvanceToNextInCycle();
+              cond_var_->notify_all();
+              continue;
+            } else {
+              // Wait for the iterator to produce a result.
+              return false;
+            }
+          } else {
+            if (!future_elements_.empty() || !end_of_input_) {
+              // Wait for an element to be created.
+              return false;
+            }
+            // No new elements will be created; try to find a
+            // non-empty element in the cycle.
+            for (int i = 0; i < dataset()->cycle_length_; ++i) {
+              AdvanceToNextInCycle();
+              if (current_elements_[cycle_index_]) {
+                break;
+              }
+            }
+            if (current_elements_[cycle_index_]) {
+              continue;
+            }
+            // End of input has been reached.
+            return true;
+          }
+        }
+      }
+
+      // Manages current cycle elements, creating new iterators as needed and
+      // asynchronously fetching results from existing iterators.
       //
-      // If end of input is encountered, the `skip` field of the invocation
-      // result is used to identify results that should be skipped.
-      void FetchOutputs(
-          const std::shared_ptr<IteratorContext>& ctx, IteratorBase* iterator,
-          int64 cycle_index,
-          const std::vector<std::shared_ptr<InvocationResult>>& results)
-          LOCKS_EXCLUDED(*mu_) {
+      // This method runs in the `current_elements_manager_` background thread.
+      void CurrentElementsManager(const std::shared_ptr<IteratorContext>& ctx) {
         RecordStart(ctx.get());
         auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
-        bool end_of_input = false;
-        for (auto& result : results) {
-          if (!end_of_input) {
-            result->status = iterator->GetNext(
-                ctx.get(), &result->return_values, &end_of_input);
+        auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(*mu_) -> bool {
+          const bool has_more_elements =
+              !future_elements_.empty() || !end_of_input_;
+          const int block_length = dataset()->block_length_;
+          bool all_elements_busy = true;
+          for (auto& element : current_elements_) {
+            if (!element) {
+              if (has_more_elements) {
+                all_elements_busy = false;
+                break;
+              }
+            } else {
+              mutex_lock l(element->mu);
+              if (!element->in_use && element->iterator &&
+                  element->results.size() < block_length) {
+                all_elements_busy = false;
+                break;
+              }
+            }
           }
-          if (end_of_input) {
-            result->skip = true;
+          return all_elements_busy || num_calls_ >= num_parallel_calls_->value;
+        };
+        while (true) {
+          mutex_lock l(*mu_);
+
+          // Wait until this thread is cancelled, the end of input has been
+          // reached.
+          while (!cancelled_ && (!end_of_input_ || num_open_ > 0) && busy()) {
+            RecordStop(ctx.get());
+            cond_var_->wait(l);
+            RecordStart(ctx.get());
           }
-          RecordBufferEnqueue(ctx.get(), result->return_values);
-          {
-            mutex_lock l(*mu_);
-            result->notification.Notify();
-            cond_var_->notify_all();
+
+          if (cancelled_ ||
+              (future_elements_.empty() && end_of_input_ && num_open_ == 0)) {
+            return;
+          }
+
+          for (int i = 0; i < dataset()->cycle_length_; ++i) {
+            int idx = (cycle_index_ + i) % dataset()->cycle_length_;
+            if (!current_elements_[idx]) {
+              if (!future_elements_.empty()) {
+                current_elements_[idx] = std::move(future_elements_.back());
+                future_elements_.pop_back();
+              } else {
+                current_elements_[idx] = MakeElement(ctx);
+                if (!current_elements_[idx]) {
+                  continue;
+                }
+              }
+            }
+            std::shared_ptr<Element> element = current_elements_[idx];
+            if (!element->in_use && element->iterator) {
+              int64 num_results;
+              {
+                mutex_lock l(element->mu);
+                num_results =
+                    dataset()->block_length_ - element->results.size();
+              }
+              if (num_results > 0) {
+                num_calls_++;
+                element->in_use = true;
+                thread_pool_->Schedule(
+                    std::bind(&ParallelInterleaveIterator::FetchResults, this,
+                              ctx, std::move(element), num_results));
+              }
+            }
           }
-          if (!result->status.ok()) {
+          const auto& stats_aggregator = ctx->stats_aggregator();
+          if (stats_aggregator) {
+            stats_aggregator->AddScalar(
+                strings::StrCat(key_prefix_, "::thread_utilization"),
+                static_cast<float>(num_calls_) /
+                    static_cast<float>(num_parallel_calls_->value));
+          }
+          cond_var_->notify_all();
+        }
+      }
+
+      void EnsureThreadsStarted(IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        if (!current_elements_manager_) {
+          auto new_ctx = std::make_shared<IteratorContext>(*ctx);
+          current_elements_manager_ =
+              WrapUnique<Thread>(ctx->env()->StartThread(
+                  {}, "tf_data_parallel_interleave_current",
+                  [this, new_ctx]() { CurrentElementsManager(new_ctx); }));
+        }
+        if (!future_elements_manager_) {
+          auto new_ctx = std::make_shared<IteratorContext>(*ctx);
+          future_elements_manager_ = WrapUnique<Thread>(ctx->env()->StartThread(
+              {}, "tf_data_parallel_interleave_future",
+              [this, new_ctx]() { FutureElementsManager(new_ctx); }));
+        }
+      }
+
+      // Fetches up to `dataset()->block_length_` results from `element`.
+      void FetchResults(const std::shared_ptr<IteratorContext>& ctx,
+                        const std::shared_ptr<Element>& element,
+                        int64 num_results) LOCKS_EXCLUDED(*mu_) {
+        RecordStart(ctx.get());
+        auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
+        bool end_of_input = false;
+        for (int64 i = 0; i < num_results; ++i) {
+          auto result = std::make_shared<Result>();
+          result->status = element->iterator->GetNext(
+              ctx.get(), &result->return_values, &end_of_input);
+          if (end_of_input) {
             break;
           }
+          RecordBufferEnqueue(ctx.get(), result->return_values);
+          mutex_lock l(*mu_);
+          mutex_lock l2(element->mu);
+          element->results.push_back(result);
+          result->is_ready = true;
+          cond_var_->notify_all();
         }
 
         mutex_lock l(*mu_);
-        current_elements_[cycle_index].in_use = false;
+        // Release the ownership of the cycle element iterator.
+        element->in_use = false;
         if (end_of_input) {
-          // Release the ownership of the cycle element iterator, closing the
-          // iterator if end of input was encountered.
-          current_elements_[cycle_index].iterator.reset();
-          current_elements_[cycle_index].inputs.clear();
-          num_open_--;
+          // Close the iterator if end of input was encountered.
+          element->iterator.reset();
+          element->inputs.clear();
+          --num_open_;
         }
-        num_calls_--;
+        --num_calls_;
         const auto& stats_aggregator = ctx->stats_aggregator();
         if (stats_aggregator) {
           stats_aggregator->AddScalar(
-              strings::StrCat(prefix_end_, "::active_parallel_calls"),
-              static_cast<float>(num_calls_));
+              strings::StrCat(key_prefix_, "::thread_utilization"),
+              static_cast<float>(num_calls_) /
+                  static_cast<float>(num_parallel_calls_->value));
         }
         cond_var_->notify_all();
       }
 
-      // Method responsible for 1) creating iterators out of input elements, 2)
-      // determining the order in which elements are fetched from the iterators,
-      // and 3) scheduling the fetching of the elements to a threadpool.
+      // Manages futures cycle elements, creating new iterators as needed and
+      // asynchronously fetching results from existing iterators.
       //
-      // This method runs in the `runner_thread` background thread.
-      void RunnerThread(const std::shared_ptr<IteratorContext>& ctx) {
+      // This method runs in the `future_elements_manager_` background thread.
+      void FutureElementsManager(const std::shared_ptr<IteratorContext>& ctx) {
         RecordStart(ctx.get());
         auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
         auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(*mu_) -> bool {
-          return current_elements_[cycle_index_].in_use ||
-                 num_calls_ >= num_parallel_calls_->value ||
-                 invocation_results_.size() >=
-                     dataset()->cycle_length_ * dataset()->block_length_;
+          return num_calls_ >= num_parallel_calls_->value ||
+                 future_elements_.size() >= dataset()->cycle_length_;
         };
         while (true) {
           mutex_lock l(*mu_);
+
           // Wait until this thread is cancelled, the end of input has been
           // reached, or the cycle element at the `cycle_index_` position is
-          // not in use and there is space in the `invocation_results_` queue.
-          while (!cancelled_ && (!end_of_input_ || num_open_ > 0) && busy()) {
+          // not in use.
+          while (!cancelled_ && !end_of_input_ && busy()) {
             RecordStop(ctx.get());
             cond_var_->wait(l);
             RecordStart(ctx.get());
           }
 
-          if (cancelled_ || (end_of_input_ && num_open_ == 0)) {
+          if (cancelled_ || end_of_input_) {
             return;
           }
 
-          while ((!end_of_input_ || num_open_ > 0) && !busy()) {
-            if (!current_elements_[cycle_index_].iterator) {
-              // Try to create a new iterator from the next input element.
-              Status status = input_impl_->GetNext(
-                  ctx.get(), &current_elements_[cycle_index_].inputs,
-                  &end_of_input_);
-              if (!status.ok()) {
-                invocation_results_.emplace_back(new InvocationResult());
-                std::shared_ptr<InvocationResult>& result =
-                    invocation_results_.back();
-                result->status.Update(status);
-                result->notification.Notify();
-                break;
-              }
-              if (!end_of_input_) {
-                Status status = MakeIteratorFromInputElement(
-                    ctx.get(), current_elements_[cycle_index_].inputs,
-                    cycle_index_, *instantiated_captured_func_, prefix(),
-                    &current_elements_[cycle_index_].iterator);
-                if (!status.ok()) {
-                  invocation_results_.emplace_back(new InvocationResult());
-                  std::shared_ptr<InvocationResult>& result =
-                      invocation_results_.back();
-                  result->status.Update(status);
-                  result->notification.Notify();
-                  break;
-                }
-                ++num_open_;
-              }
+          while (!end_of_input_ && !busy()) {
+            std::shared_ptr<Element> element = MakeElement(ctx);
+            if (!element) {
+              break;
             }
-            if (current_elements_[cycle_index_].iterator) {
-              // Pre-allocate invocation results for outputs to be fetched
-              // and then fetch the outputs asynchronously.
-              std::vector<std::shared_ptr<InvocationResult>> results;
-              results.reserve(dataset()->block_length_);
-              for (int i = 0; i < dataset()->block_length_; ++i) {
-                invocation_results_.emplace_back(new InvocationResult());
-                results.push_back(invocation_results_.back());
-              }
-              num_calls_++;
-              current_elements_[cycle_index_].in_use = true;
-              thread_pool_->Schedule(
-                  std::bind(&ParallelInterleaveIterator::FetchOutputs, this,
-                            ctx, current_elements_[cycle_index_].iterator.get(),
-                            cycle_index_, std::move(results)));
+            future_elements_.push_front(element);
+            if (!element->iterator) {
+              continue;
             }
-            cycle_index_ = (cycle_index_ + 1) % dataset()->cycle_length_;
+            ++num_calls_;
+            element->in_use = true;
+            thread_pool_->Schedule(
+                std::bind(&ParallelInterleaveIterator::FetchResults, this, ctx,
+                          std::move(element), dataset()->block_length_));
           }
           const auto& stats_aggregator = ctx->stats_aggregator();
           if (stats_aggregator) {
-            // TODO(shivaniagrawal): add `parallel_calls_utilization` in the
-            // monitoring code or as histogram at fixed time intervals.
             stats_aggregator->AddScalar(
-                strings::StrCat(prefix_end_, "::active_parallel_calls"),
-                static_cast<float>(num_calls_));
-            stats_aggregator->AddScalar(
-                strings::StrCat(prefix_end_, "::num_parallel_calls"),
-                static_cast<float>(num_parallel_calls_->value));
+                strings::StrCat(key_prefix_, "::thread_utilization"),
+                static_cast<float>(num_calls_) /
+                    static_cast<float>(num_parallel_calls_->value));
           }
           cond_var_->notify_all();
         }
       }
 
-      // Determines whether the caller needs to wait for a result. Upon
-      // returning false, `result` will either be NULL if end of input has been
-      // reached or point to the result.
-      bool ShouldWait(std::shared_ptr<InvocationResult>* result)
+      // Creates a new element.
+      std::shared_ptr<Element> MakeElement(
+          const std::shared_ptr<IteratorContext>& ctx)
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
-        if (sloppy_) {
-          for (auto it = invocation_results_.begin();
-               it != invocation_results_.end(); ++it) {
-            if ((*it)->notification.HasBeenNotified()) {
-              std::swap(*result, *it);
-              invocation_results_.erase(it);
-              cond_var_->notify_all();
-              return false;
-            }
+        auto element = std::make_shared<Element>();
+        element->id = element_id_counter_++;
+        Status status =
+            input_impl_->GetNext(ctx.get(), &element->inputs, &end_of_input_);
+        if (!status.ok()) {
+          auto result = std::make_shared<Result>();
+          result->is_ready = true;
+          result->status = status;
+          mutex_lock l(element->mu);
+          element->results.push_back(std::move(result));
+          return element;
+        }
+        if (!end_of_input_) {
+          Status status = MakeIteratorFromInputElement(
+              ctx.get(), element->inputs, element->id,
+              *instantiated_captured_func_, prefix(), &element->iterator);
+          if (!status.ok()) {
+            auto result = std::make_shared<Result>();
+            result->is_ready = true;
+            result->status = status;
+            mutex_lock l(element->mu);
+            element->results.push_back(std::move(result));
+            return element;
           }
-          return !invocation_results_.empty() ||
-                 (!end_of_input_ || num_open_ > 0);
+          ++num_open_;
         } else {
-          if (!invocation_results_.empty()) {
-            std::swap(*result, invocation_results_.front());
-            invocation_results_.pop_front();
-            cond_var_->notify_all();
-            return false;
-          }
-          return (!end_of_input_ || num_open_ > 0);
+          element.reset();
         }
+        return element;
       }
 
-      Status WriteStatusLocked(IteratorStateWriter* writer, size_t index,
+      Status WriteStatusLocked(IteratorStateWriter* writer,
+                               const string& key_prefix, size_t idx,
                                const Status& status)
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         TF_RETURN_IF_ERROR(writer->WriteScalar(
-            CodeKey(index), static_cast<int64>(status.code())));
+            CodeKey(key_prefix, idx), static_cast<int64>(status.code())));
         if (!status.ok()) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(ErrorMessageKey(index),
-                                                 status.error_message()));
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              ErrorMessageKey(key_prefix, idx), status.error_message()));
         }
         return Status::OK();
       }
 
-      Status ReadStatusLocked(IteratorStateReader* reader, size_t index,
+      Status ReadStatusLocked(IteratorStateReader* reader,
+                              const string& key_prefix, size_t idx,
                               Status* status) EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         int64 code_int;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(CodeKey(index), &code_int));
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(CodeKey(key_prefix, idx), &code_int));
         error::Code code = static_cast<error::Code>(code_int);
 
         if (code != error::Code::OK) {
           string error_message;
-          TF_RETURN_IF_ERROR(
-              reader->ReadScalar(ErrorMessageKey(index), &error_message));
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              ErrorMessageKey(key_prefix, idx), &error_message));
           *status = Status(code, error_message);
         } else {
           *status = Status::OK();
@@ -585,64 +686,178 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      string CodeKey(size_t index) {
+      string CodeKey(const string& key_prefix, size_t idx) {
         return full_name(
-            strings::StrCat("invocation_results[", index, "].code"));
+            strings::StrCat(key_prefix, ".results[", idx, "].code"));
       }
 
-      string ErrorMessageKey(size_t index) {
+      string ErrorMessageKey(const string& key_prefix, size_t idx) {
         return full_name(
-            strings::StrCat("invocation_results[", index, "].error_message"));
+            strings::StrCat(key_prefix, ".results[", idx, "].error_message"));
+      }
+
+      Status WriteElement(std::shared_ptr<Element> element, int idx,
+                          const string& key_prefix, IteratorStateWriter* writer)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        if (element->iterator) {
+          TF_RETURN_IF_ERROR(SaveInput(writer, element->iterator));
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(key_prefix, "[", idx, "].id")),
+              element->id));
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(key_prefix, "[", idx, "].inputs.size")),
+              element->inputs.size()));
+          for (int i = 0; i < element->inputs.size(); i++) {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(
+                    strings::StrCat(key_prefix, "[", idx, "].inputs[", i, "]")),
+                element->inputs[i]));
+          }
+        }
+        mutex_lock l(element->mu);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(key_prefix, "[", idx, "].results.size")),
+            element->results.size()));
+        for (size_t i = 0; i < element->results.size(); i++) {
+          std::shared_ptr<Result> result = element->results[i];
+          TF_RETURN_IF_ERROR(WriteStatusLocked(
+              writer, strings::StrCat(key_prefix, "[", idx, "]"), i,
+              result->status));
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(key_prefix, "[", idx, "].results[", i,
+                                        "].size")),
+              result->return_values.size()));
+          for (size_t j = 0; j < result->return_values.size(); j++) {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(strings::StrCat(key_prefix, "[", idx, "].results[", i,
+                                          "][", j, "]")),
+                result->return_values[j]));
+          }
+          if (result->is_ready) {
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name(strings::StrCat(key_prefix, "[", idx, "].results[", i,
+                                          "].is_ready")),
+                ""));
+          }
+        }
+        return Status::OK();
       }
 
       Status WriteCurrentElements(IteratorStateWriter* writer)
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name("current_elements.size"), current_elements_.size()));
         for (int idx = 0; idx < current_elements_.size(); idx++) {
-          if (current_elements_[idx].iterator) {
-            TF_RETURN_IF_ERROR(
-                SaveInput(writer, current_elements_[idx].iterator));
-            TF_RETURN_IF_ERROR(writer->WriteScalar(
-                full_name(
-                    strings::StrCat("current_elements[", idx, "].inputs.size")),
-                current_elements_[idx].inputs.size()));
-            for (int i = 0; i < current_elements_[idx].inputs.size(); i++) {
-              TF_RETURN_IF_ERROR(writer->WriteTensor(
-                  full_name(strings::StrCat("current_elements[", idx,
-                                            "].inputs[", i, "]")),
-                  current_elements_[idx].inputs[i]));
-            }
+          if (current_elements_[idx]) {
+            TF_RETURN_IF_ERROR(WriteElement(current_elements_[idx], idx,
+                                            "current_elements", writer));
           }
         }
         return Status::OK();
       }
 
+      Status WriteFutureElements(IteratorStateWriter* writer)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name("future_elements.size"), future_elements_.size()));
+        for (int idx = 0; idx < future_elements_.size(); idx++) {
+          if (future_elements_[idx]) {
+            TF_RETURN_IF_ERROR(WriteElement(future_elements_[idx], idx,
+                                            "future_elements", writer));
+          }
+        }
+        return Status::OK();
+      }
+
+      Status ReadElement(IteratorContext* ctx, IteratorStateReader* reader,
+                         int idx, const string& key_prefix,
+                         std::shared_ptr<Element>* out)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        if (!reader->Contains(full_name(
+                strings::StrCat(key_prefix, "[", idx, "].results.size")))) {
+          return Status::OK();
+        }
+        auto element = std::make_shared<Element>();
+        mutex_lock l(element->mu);
+        int64 results_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(key_prefix, "[", idx, "].results.size")),
+            &results_size));
+        element->results.resize(results_size);
+        for (size_t i = 0; i < results_size; i++) {
+          auto result = std::make_shared<Result>();
+          TF_RETURN_IF_ERROR(ReadStatusLocked(
+              reader, strings::StrCat(key_prefix, "[", idx, "]"), i,
+              &result->status));
+          int64 num_return_values;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat(key_prefix, "[", idx, "].results[", i,
+                                        "].size")),
+              &num_return_values));
+          result->return_values.reserve(num_return_values);
+          for (size_t j = 0; j < num_return_values; j++) {
+            result->return_values.emplace_back();
+            TF_RETURN_IF_ERROR(reader->ReadTensor(
+                full_name(strings::StrCat(key_prefix, "[", idx, "].results[", i,
+                                          "][", j, "]")),
+                &result->return_values.back()));
+          }
+          result->is_ready = reader->Contains(full_name(strings::StrCat(
+              key_prefix, "[", idx, "].results[", i, "].is_ready")));
+          element->results[i] = std::move(result);
+        }
+        if (!reader->Contains(full_name(
+                strings::StrCat(key_prefix, "[", idx, "].inputs.size")))) {
+          element->iterator.reset();
+          *out = std::move(element);
+          return Status::OK();
+        }
+        int64 inputs_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(key_prefix, "[", idx, "].inputs.size")),
+            &inputs_size));
+        element->inputs.resize(inputs_size);
+        for (int i = 0; i < inputs_size; i++) {
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(
+                  strings::StrCat(key_prefix, "[", idx, "].inputs[", i, "]")),
+              &element->inputs[i]));
+        }
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(key_prefix, "[", idx, "].id")),
+            &element->id));
+        TF_RETURN_IF_ERROR(MakeIteratorFromInputElement(
+            ctx, element->inputs, element->id,
+            *instantiated_captured_func_.get(), prefix(), &element->iterator));
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, element->iterator));
+        *out = std::move(element);
+        return Status::OK();
+      }
+
       Status ReadCurrentElements(IteratorContext* ctx,
                                  IteratorStateReader* reader)
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        int64 size;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("current_elements.size"), &size));
+        DCHECK_EQ(current_elements_.size(), size);
         for (int idx = 0; idx < current_elements_.size(); idx++) {
-          if (reader->Contains(full_name(strings::StrCat(
-                  "current_elements[", idx, "].inputs.size")))) {
-            int64 inputs_size;
-            TF_RETURN_IF_ERROR(reader->ReadScalar(
-                full_name(
-                    strings::StrCat("current_elements[", idx, "].inputs.size")),
-                &inputs_size));
-            current_elements_[idx].inputs.resize(inputs_size);
-            for (int i = 0; i < inputs_size; i++) {
-              TF_RETURN_IF_ERROR(reader->ReadTensor(
-                  full_name(strings::StrCat("current_elements[", idx,
-                                            "].inputs[", i, "]")),
-                  &current_elements_[idx].inputs[i]));
-            }
-            TF_RETURN_IF_ERROR(MakeIteratorFromInputElement(
-                ctx, current_elements_[idx].inputs, idx,
-                *instantiated_captured_func_.get(), prefix(),
-                &current_elements_[idx].iterator));
-            TF_RETURN_IF_ERROR(
-                RestoreInput(ctx, reader, current_elements_[idx].iterator));
-          } else {
-            current_elements_[idx].iterator.reset();
-          }
+          TF_RETURN_IF_ERROR(ReadElement(ctx, reader, idx, "current_elements",
+                                         &current_elements_[idx]));
+        }
+        return Status::OK();
+      }
+
+      Status ReadFutureElements(IteratorContext* ctx,
+                                IteratorStateReader* reader)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        int64 size;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("future_elements.size"), &size));
+        future_elements_.resize(size);
+        for (int idx = 0; idx < future_elements_.size(); idx++) {
+          TF_RETURN_IF_ERROR(ReadElement(ctx, reader, idx, "future_elements",
+                                         &future_elements_[idx]));
         }
         return Status::OK();
       }
@@ -651,12 +866,11 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       // the worker threads.
       const std::shared_ptr<mutex> mu_;
 
-      // Used for coordination between the main thread, the runner thread, and
-      // the worker threads. In particular, the runner thread should only
-      // schedule new calls when the number of in-flight calls is less than the
-      // user specified level of parallelism, there are slots available in the
-      // `invocation_results_` buffer, the current cycle element is not in use,
-      // and there are elements left to be fetched.
+      // Used for coordination between the main thread, the manager threads, and
+      // the threadpool threads. In particular, the managers thread should only
+      // schedule new calls into the threadpool when the number of in-flight
+      // calls is less than the user specified level of parallelism and there
+      // are slots available in the element `results` buffer.
       const std::shared_ptr<condition_variable> cond_var_;
 
       // Identifies the maximum number of parallel calls.
@@ -668,18 +882,17 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       // Iterator for input elements.
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(*mu_);
 
-      // Identifies current cycle element.
-      int64 cycle_index_ = 0;
+      // Identifies position in the interleave cycle.
+      int64 block_index_ GUARDED_BY(*mu_) = 0;
+      int64 cycle_index_ GUARDED_BY(*mu_) = 0;
 
-      // Iterators for the current cycle elements. Concurrent access is
-      // protected by `element_in_use_`.
-      std::vector<Element> current_elements_ GUARDED_BY(*mu_);
+      // Elements of the current interleave cycle.
+      std::vector<std::shared_ptr<Element>> current_elements_ GUARDED_BY(*mu_);
 
-      // Buffer for storing the invocation results.
-      std::deque<std::shared_ptr<InvocationResult>> invocation_results_
-          GUARDED_BY(*mu_);
+      // Elements to be used in the interleave cycle in the future.
+      std::deque<std::shared_ptr<Element>> future_elements_ GUARDED_BY(*mu_);
 
-      // Identifies whether end of input has been reached.
+      // Identifies whether the global end of input has been reached.
       bool end_of_input_ GUARDED_BY(*mu_) = false;
 
       // Identifies the number of open iterators.
@@ -689,11 +902,13 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       int64 num_calls_ GUARDED_BY(*mu_) = 0;
 
       std::unique_ptr<thread::ThreadPool> thread_pool_;
-      std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
+      std::unique_ptr<Thread> current_elements_manager_ GUARDED_BY(*mu_);
+      std::unique_ptr<Thread> future_elements_manager_ GUARDED_BY(*mu_);
+      int64 element_id_counter_ GUARDED_BY(*mu_) = 0;
 
-      // Identifies whether background activity should be cancelled.
+      // Identifies whether background threads should be cancelled.
       bool cancelled_ GUARDED_BY(*mu_) = false;
-      string prefix_end_;
+      string key_prefix_;
       std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc
index b62e7059bab42d7ace20c3fe9d681e2c129b926e..9a4ce981d07ec39fcaedb83cee588104da97bcc6 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.cc
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc
@@ -60,7 +60,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
         preserve_cardinality_(params.preserve_cardinality) {
     std::vector<string> components =
         str_util::Split(base_params.prefix, "::", str_util::SkipEmpty());
-    prefix_end_ = components.back();
+    key_prefix_ = components.back();
   }
 
   ~ParallelMapIterator() override {
@@ -207,8 +207,9 @@ class ParallelMapIterator : public DatasetBaseIterator {
     const auto& stats_aggregator = ctx->stats_aggregator();
     if (stats_aggregator) {
       stats_aggregator->AddScalar(
-          strings::StrCat(prefix_end_, "::active_parallel_calls"),
-          static_cast<float>(num_calls_));
+          strings::StrCat(key_prefix_, "::thread_utilization"),
+          static_cast<float>(num_calls_) /
+              static_cast<float>(num_parallel_calls_->value));
     }
     RecordBufferEnqueue(ctx.get(), result->return_values);
     result->notification.Notify();
@@ -300,14 +301,10 @@ class ParallelMapIterator : public DatasetBaseIterator {
         }
         const auto& stats_aggregator = ctx->stats_aggregator();
         if (stats_aggregator) {
-          // TODO(shivaniagrawal): add `parallel_calls_utilization` in the
-          // monitoring code or as histogram at fixed time intervals.
-          stats_aggregator->AddScalar(
-              strings::StrCat(prefix_end_, "::active_parallel_calls"),
-              static_cast<float>(num_calls_));
           stats_aggregator->AddScalar(
-              strings::StrCat(prefix_end_, "::num_parallel_calls"),
-              static_cast<float>(num_parallel_calls_->value));
+              strings::StrCat(key_prefix_, "::thread_utilization"),
+              static_cast<float>(num_calls_) /
+                  static_cast<float>(num_parallel_calls_->value));
         }
         cond_var_->notify_all();
       }
@@ -403,7 +400,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
       GUARDED_BY(*mu_);
   std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
   bool cancelled_ GUARDED_BY(*mu_) = false;
-  string prefix_end_;
+  string key_prefix_;
 };
 
 }  // namespace
diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
index db0cc6fa4db2af07b3906e7daaf1ff0e3690dd15..4c380c1fa2ce143f6cdff2a5e05708d90c4b750f 100644
--- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
@@ -412,7 +412,7 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
             parent_generator_(seed, seed2),
             generator_(&parent_generator_) {}
 
-      string DebugString() override {
+      string DebugString() const override {
         return "ReshufflingDataset::RandomSeedGenerator";
       }
 
diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
index da3bdb475e274d73751e22334628e3431023b9e4..c152f2b7e4125687f1b670fae374e2a747cd902c 100644
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -633,7 +633,8 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
       // conv is supported.
       launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop,
                 reshaped_filter, /*row_dilation=*/1, /*col_dilation=*/1,
-                stride_, stride_, padding_, in_backprop, data_format_);
+                stride_, stride_, padding_, /*explicit_paddings=*/{},
+                in_backprop, data_format_);
       return;
     }
 
@@ -1115,7 +1116,8 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
       // conv is supported.
       launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, input,
                 /*row_dilation=*/1, /*col_dilation=*/1, stride_, stride_,
-                padding_, &reshaped_filter, data_format_);
+                padding_, /*explicit_paddings=*/{}, &reshaped_filter,
+                data_format_);
       return;
     }
 
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index f0902fdba6921b46fd7a0d0adb16e470ed83f65c..dacd3cfea8e71cdacf767ae64920c393f68278a3 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -404,7 +404,8 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
       // conv is supported.
       launcher_(context, use_cudnn_, cudnn_use_autotune_, input,
                 reshaped_filter, /*row_dilation=*/1, /*col_dilation=*/1,
-                stride_, stride_, padding_, output, data_format_);
+                stride_, stride_, padding_, /*explicit_paddings=*/{}, output,
+                data_format_);
       return;
     }
 
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.h
similarity index 98%
rename from tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
rename to tensorflow/core/kernels/depthwise_conv_op_gpu.h
index e811968d277ba3594341a59e8d6262cac637e602..098853e68430d425143d16ff2e8edbb9877f8e23 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_GPU_H_
+
 #if GOOGLE_CUDA
 #define EIGEN_USE_GPU
 
@@ -38,7 +41,7 @@ using Eigen::GpuDevice;
 
 // Returns whether depthwise convolution forward or backward input pass can be
 // performed using the faster ('Small') variant of the kernel.
-EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dGPUSmall(
+inline EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dGPUSmall(
     const DepthwiseArgs& args) {
   return args.depth_multiplier == 1 && args.stride == 1 && args.in_rows <= 32 &&
          args.in_cols <= 32 && args.in_rows == args.out_rows &&
@@ -51,7 +54,7 @@ EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dGPUSmall(
 
 // Returns whether depthwise convolution backward filter pass can be performed
 // using the faster ('Small') variant of the kernel.
-EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(
+inline EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(
     const DepthwiseArgs& args, const int block_height) {
   return args.depth_multiplier == 1 && args.stride == 1 && args.in_rows <= 32 &&
          args.in_cols <= 32 && args.in_rows == args.out_rows &&
@@ -652,13 +655,12 @@ struct PseudoHalfType<Eigen::half> {
 };
 }  // namespace detail
 
-namespace {
 // Maps to float if T is __half, and to T otherwise.
 template <typename T>
 using PseudoHalfType = typename detail::PseudoHalfType<T>::Type;
 
 // Returns whether the context's GPU supports efficient fp16 math.
-bool HasFastHalfMath(OpKernelContext* ctx) {
+inline bool HasFastHalfMath(OpKernelContext* ctx) {
   int major, minor;
   ctx->op_device_context()
       ->stream()
@@ -669,7 +671,6 @@ bool HasFastHalfMath(OpKernelContext* ctx) {
   // GPUs before sm_53 don't support fp16 math, and sm_61's fp16 math is slow.
   return cuda_arch >= 530 && cuda_arch != 610;
 }
-}  // namespace
 
 template <typename T, DepthwiseConv2dDirection kDirection,
           int kKnownFilterWidth, int kKnownFilterHeight, int kBlockDepth,
@@ -808,10 +809,6 @@ void LaunchDepthwiseConvOp<GpuDevice, T>::operator()(OpKernelContext* ctx,
   }
 }
 
-template struct LaunchDepthwiseConvOp<GpuDevice, Eigen::half>;
-template struct LaunchDepthwiseConvOp<GpuDevice, float>;
-template struct LaunchDepthwiseConvOp<GpuDevice, double>;
-
 // A Cuda kernel to compute the depthwise convolution backprop w.r.t. input.
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
           int kKnownDepthMultiplier>
@@ -1030,10 +1027,6 @@ void LaunchDepthwiseConvBackpropInputOp<GpuDevice, T>::operator()(
   }
 }
 
-template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, Eigen::half>;
-template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, float>;
-template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, double>;
-
 // A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
           int kKnownDepthMultiplier>
@@ -1803,9 +1796,7 @@ void LaunchDepthwiseConvBackpropFilterOp<GpuDevice, T>::operator()(
                  ctx, args, out_backprop, input, filter_backprop, data_format));
   }
 }
-
-template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, Eigen::half>;
-template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, float>;
-template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, double>;
 }  // namespace tensorflow
 #endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_GPU_H_
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu_double.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu_double.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..073e7cf269844a7b355019493dad3d9287c00bf5
--- /dev/null
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu_double.cu.cc
@@ -0,0 +1,30 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/depthwise_conv_op.h"
+#include "tensorflow/core/kernels/depthwise_conv_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct LaunchDepthwiseConvOp<GpuDevice, double>;
+template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, double>;
+template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, double>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu_float.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu_float.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4b0e15e4766713130e86224dc9f255fe8ecead81
--- /dev/null
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu_float.cu.cc
@@ -0,0 +1,30 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/depthwise_conv_op.h"
+#include "tensorflow/core/kernels/depthwise_conv_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct LaunchDepthwiseConvOp<GpuDevice, float>;
+template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, float>;
+template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, float>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu_half.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu_half.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2db9fa4dff5bf58cb52d44c3c044ba4fc34d6d9f
--- /dev/null
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu_half.cu.cc
@@ -0,0 +1,30 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/depthwise_conv_op.h"
+#include "tensorflow/core/kernels/depthwise_conv_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct LaunchDepthwiseConvOp<GpuDevice, Eigen::half>;
+template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, Eigen::half>;
+template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, Eigen::half>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/eigen_contraction_kernel.h b/tensorflow/core/kernels/eigen_contraction_kernel.h
index 3d8e52ca0e49828b54604f7c5107f5dfd05d6891..05a3ae07a1137078806301c81727fdfeb8429918 100644
--- a/tensorflow/core/kernels/eigen_contraction_kernel.h
+++ b/tensorflow/core/kernels/eigen_contraction_kernel.h
@@ -179,6 +179,9 @@ class TensorContractionBlocking<float, float, float, StorageIndex,
                                                      num_threads);
     }
 
+    // If dimensions do not pass basic sanity checks return immediately.
+    if (kc_ <= 0 || mc_ <= 0 || nc_ <= 0) return;
+
     // If we are using default Eigen gebp kernel there is no need to adjust the
     // block sizes for MKL-DNN.
     if (!UseCustomContractionKernels()) return;
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index 86d8c98ee65aebb2927b338dfb236f470a3a1d39..8b198139400a6d2ce2795f9ef0b5793114a78e0b 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -1683,8 +1683,6 @@ EIGEN_DEVICE_FUNC
     kernel_dims[0] = kernelChannels * kernelRows * kernelCols;
     kernel_dims[1] = kernelFilters;
   }
-  // TODO(yangke): choose() is defined in TensorContraction.h -- consider
-  // moving it to somewhere more "common".
   return choose(
       Cond<internal::traits<Input>::Layout == ColMajor>(),
       kernel.reshape(kernel_dims)
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
index 22f71d62602cc984c0337f728298f7483c35bed9..03002adec4740090d8ea65f31f88a73e1a565310 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/eigen_spatial_convolutions.h"
+
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/kernels/eigen_cuboid_convolution.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -1540,22 +1542,187 @@ static void PackRhsHelper(int iters,
     pack_rhs(packed.data() + packed_offset, sub_mapper, depth, cols);
   }
   tensorflow::testing::StopTiming();
+  tensorflow::testing::SetLabel(
+      absl::StrCat("patch: ", patch_rows, "x", patch_cols, " D", patch_depth,
+                   "; num_patches=", num_patches, " patch_size=", patch_size,
+                   " num_inputs=", num_inputs));
+}
+
+static void PackLhsHelper(int iters,
+                          /* Input dimensions: */
+                          int input_depth,
+                          /* Filter (kernel) dimensions: */
+                          int filter_count, int filter_cols, int filter_rows,
+                          /* Block dimensions: */
+                          Index block_rows, Index block_cols) {
+  // Set random seed for benchmark repeatability.
+  srand(12345);
+
+  eigen_assert(block_rows <= filter_count);
+  eigen_assert(block_cols <= input_depth * filter_rows * filter_cols);
+
+  tensorflow::testing::UseRealTime();
+  tensorflow::testing::StopTiming();
+
+  using Dimensions = Eigen::DSizes<Eigen::Index, 4>;
+
+  // Default Eigen::Tensor layout is column major, so we configure dimensions
+  // starting from the inner most (`filter count` aka `kernel filers`).
+  Dimensions filter_dims(filter_count, filter_rows, filter_cols, input_depth);
+
+  static const int packet_size = Eigen::internal::packet_traits<float>::size;
+
+  // We are going to reshape filter into 2D tensor.
+  using NewDimension = Eigen::DSizes<Index, 2>;
+
+  // Contraction dimensions.
+  using nocontract_t = Eigen::array<Eigen::Index, 1>;
+  using contract_t = Eigen::array<Eigen::Index, 1>;
+
+  // Input to the ReshapeOp. It is the tensorflow TTypes<float>::Tensor
+  // with ColMajor layout, instead of RowMajor. But that doesn't make any
+  // difference, because TensorContraction swaps LHS with RHS for row major
+  // inputs, and contraction mapper always works with column major data.
+  using ArgType = TensorMap<Tensor<float, 4>, Eigen::Aligned>;
+
+  using Evaluator =
+      TensorEvaluator<const TensorReshapingOp<NewDimension, ArgType>,
+                      Eigen::DefaultDevice>;
+
+  using InputMapper = Eigen::internal::TensorContractionInputMapper<
+      float, Index, Eigen::internal::Lhs, Evaluator,  //
+      nocontract_t, contract_t,                       //
+      packet_size,                                    //
+      /*inner_dim_contiguous*/ true,                  //
+      /*inner_dim_reordered*/ false,                  //
+      /*Alignment*/ 0>;
+
+  using SubMapper = Eigen::internal::TensorContractionSubMapper<
+      float, Index, Eigen::internal::Lhs, Evaluator,  //
+      nocontract_t, contract_t,                       //
+      packet_size,                                    //
+      /*inner_dim_contiguous*/ true,                  //
+      /*inner_dim_reordered*/ false,                  //
+      /*Alignment*/ 0>;
+
+#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+  using PackLhsImpl = Eigen::internal::mkldnn_gemm_pack<float, Eigen::Index,
+                                                        SubMapper, ColMajor>;
+#else
+  using PackLhsImpl =
+      Eigen::internal::gemm_pack_lhs<float, Eigen::Index, SubMapper,      //
+                                     Traits::mr,                          //
+                                     Traits::LhsProgress,                 //
+                                     typename Traits::LhsPacket4Packing,  //
+                                     ColMajor>;
+#endif
+
+  Eigen::DefaultDevice device;
 
-  std::ostringstream stringStream;
-  stringStream << "patch: " << patch_rows << "x" << patch_cols << " D"
-               << patch_depth << "; num_patches=" << num_patches
-               << " patch_size=" << patch_size << " num_inputs=" << num_inputs;
-  tensorflow::testing::SetLabel(stringStream.str());
+  // We will reshape kernel into 2D tensor.
+  NewDimension reshape_dims;
+  reshape_dims[0] = filter_count;
+  reshape_dims[1] = input_depth * filter_rows * filter_cols;
+
+  // We are going to contract along the 'in_depth * filter_rows * filter_cols`.
+  nocontract_t nocontract_dim = {0};
+  contract_t contract_dim = {1};
+
+  // These values computed using the algorithm in TensorContraction.h, with
+  // 'nocontract_dim' and 'contract_dim' values specified above.
+  nocontract_t nocontract_strides = {1};
+  contract_t contract_strides = {filter_count};
+  nocontract_t i_strides = {1};
+  contract_t k_strides = {1};
+
+  // We use tensor of the same dimensions to store packed data.
+  Tensor<float, 4> packed(filter_dims);
+
+  // We generate multiple filter tensors, around 512mb in total size to measure
+  // realistic workload when input data in not in L1-L3 cache.
+  size_t input_bytes = filter_dims.TotalSize() * sizeof(float);
+  size_t mem_size_bytes = 1024 * 1024 * 512;
+  size_t num_filters =
+      std::max(static_cast<size_t>(1), mem_size_bytes / input_bytes);
+
+  std::vector<Tensor<float, 4>> filters;
+  std::vector<Evaluator> evaluators;
+  std::vector<InputMapper> input_mappers;
+
+  for (int i = 0; i < num_filters; ++i) {
+    filters.emplace_back(filter_dims);
+    filters[i].setRandom();
+
+    ArgType tensor_map(filters[i].data(), filter_dims);
+
+    const auto reshape_op =
+        TensorReshapingOp<NewDimension, ArgType>(tensor_map, reshape_dims);
+
+    evaluators.emplace_back(reshape_op, device);
+
+    input_mappers.emplace_back(evaluators[i], nocontract_strides, i_strides,
+                               contract_strides, k_strides);
+  }
+
+  PackLhsImpl pack_lhs;
+
+  const Index packed_total_size = filter_dims.TotalSize();
+
+  // Round up row/col/memory offsets to make them multiple of packet size.
+  const auto round_up = [](const Index idx) {
+    return (idx / packet_size) * packet_size;
+  };
+
+  // Block rows is in the [0, filter_count) range.
+  // Block cols is in the [0, filter_rows * filter_cols * input_depth) range.
+
+  const Index max_row = filter_count;
+  const Index max_col = filter_rows * filter_cols * input_depth;
+
+  tensorflow::testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    int filter_idx =
+        num_filters == 1 ? 1 : internal::random<int>(0, num_filters - 1);
+
+    Index row_offset = round_up(internal::random<Index>(0, max_row - 10));
+    Index col_offset = round_up(internal::random<Index>(0, max_col - 10));
+
+    Index rows = std::min(block_rows, max_row - row_offset);
+    Index cols = std::min(block_cols, max_col - col_offset);
+
+    // Write packed data to random memory location to emulate cold caches.
+    Index packed_offset = round_up(
+        internal::random<Index>(0, packed_total_size - rows * cols - 1));
+
+    SubMapper sub_mapper =
+        input_mappers[filter_idx].getSubMapper(row_offset, col_offset);
+
+    // NOTE: Eigen gemm_pack_lhs accepts contraction depth (k-th dimension) as a
+    // first argument (aka block cols). MKL-DNN pack is generic for lhs and rhs
+    // and accepts block rows and cols in the same order for lhs and rhs.
+#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+    pack_lhs(packed.data() + packed_offset, sub_mapper, rows, cols);
+#else
+    pack_lhs(packed.data() + packed_offset, sub_mapper, cols, rows);
+#endif
+  }
+  tensorflow::testing::StopTiming();
+  tensorflow::testing::SetLabel(absl::StrCat(
+      "filter: count=", filter_count, " dims=", filter_rows, "x", filter_cols,
+      "; input: depth=", input_depth, "; num_filers=", num_filters));
 }
 
 // -------------------------------------------------------------------------- //
-// Macro argumentnames:
+// Pack RHS
+//
+// Macro argument names:
 //    N: batch size
 //    H: height
 //    W: width
 //    C: input channels
 //   FC: filter channles
 //   FH: filter height
+//   FW: filter width
 //   SH: stride in height dimensions
 //   SW: stride in width dimensions
 //   BR: block rows
@@ -1563,16 +1730,16 @@ static void PackRhsHelper(int iters,
 
 #define BM_CONCAT(a, b) a##b
 
-#define BM_NAME(prefix, N, H, W, C, FC, FH, FW, SH, SW, BR, BC)           \
+#define BM_RHS_NAME(prefix, N, H, W, C, FC, FH, FW, SH, SW, BR, BC)       \
   BM_CONCAT(BM_##prefix##_##N##_##H##x##W##_IC##C##_FC##FC##_##FH##x##FW, \
             _s##SH##x##SW##_B##BR##x##BC)
 
-#define BM_PackRhs(N, H, W, C, FC, FH, FW, SH, SW, BR, BC)         \
-  static void BM_NAME(PackRhs, N, H, W, C, FC, FH, FW, SH, SW, BR, \
-                      BC)(int iters) {                             \
-    PackRhsHelper(iters, N, H, W, C, FC, FH, FW, SH, SW, BR, BC);  \
-  }                                                                \
-  BENCHMARK(BM_NAME(PackRhs, N, H, W, C, FC, FH, FW, SH, SW, BR, BC))
+#define BM_PackRhs(N, H, W, C, FC, FH, FW, SH, SW, BR, BC)             \
+  static void BM_RHS_NAME(PackRhs, N, H, W, C, FC, FH, FW, SH, SW, BR, \
+                          BC)(int iters) {                             \
+    PackRhsHelper(iters, N, H, W, C, FC, FH, FW, SH, SW, BR, BC);      \
+  }                                                                    \
+  BENCHMARK(BM_RHS_NAME(PackRhs, N, H, W, C, FC, FH, FW, SH, SW, BR, BC))
 
 // Number of input channel (input depth) it equal to the number of patch
 // channels (patch depth).
@@ -1645,4 +1812,37 @@ BM_PackRhs(/*batch*/ 32,        //
            /*filter*/ 3, 3,     //
            /*stride*/ 2, 2,     //
            /*block*/ 36, 432);
+
+// -------------------------------------------------------------------------- //
+// Pack LHS
+//
+// Macro argument names:
+//    C: input channels
+//   FC: filter channels
+//   FH: filter height
+//   FW: filter width
+//   BR: block rows
+//   BC: block cols
+
+#define BM_LHS_NAME(prefix, C, FC, FH, FW, BR, BC) \
+  BM_CONCAT(BM_##prefix##_##C##_FC##FC##_##FH##x##FW, _B##BR##x##BC)
+
+#define BM_PackLhs(C, FC, FH, FW, BR, BC)                              \
+  static void BM_LHS_NAME(PackLhs, C, FC, FH, FW, BR, BC)(int iters) { \
+    PackLhsHelper(iters, C, FC, FH, FW, BR, BC);                       \
+  }                                                                    \
+  BENCHMARK(BM_LHS_NAME(PackLhs, C, FC, FH, FW, BR, BC))
+
+// Number of input channel (input depth) it equal to the number of patch
+// channels (patch depth).
+
+BM_PackLhs(/*input channels*/ 128,    //
+           /*filter channels*/ 1024,  //
+           /*filter dims*/ 3, 3,      //
+           /*block*/ 256, 56);
+
+BM_PackLhs(/*input channels*/ 128,    //
+           /*filter channels*/ 1024,  //
+           /*filter dims*/ 3, 3,      //
+           /*block*/ 56, 256);
 }  // namespace Eigen
diff --git a/tensorflow/core/kernels/fifo_queue.h b/tensorflow/core/kernels/fifo_queue.h
index 697ee81c39b194e29c03f3583f0aa727778ef316..4d3a7c197125613c662c97044d6964695ab92b0e 100644
--- a/tensorflow/core/kernels/fifo_queue.h
+++ b/tensorflow/core/kernels/fifo_queue.h
@@ -49,7 +49,7 @@ class FIFOQueue : public TypedQueue<std::deque<PersistentTensor> > {
                       CallbackWithTuple callback) override;
   Status MatchesNodeDef(const NodeDef& node_def) override;
 
-  int32 size() override {
+  int32 size() const override {
     mutex_lock lock(mu_);
     return queues_[0].size();
   }
diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc
index 7090417dfdb2d7e433025b1a0f1cdeb5eece10a8..9c4c0487f09dff86efa833475ea685c30b1ac915 100644
--- a/tensorflow/core/kernels/fill_functor.cc
+++ b/tensorflow/core/kernels/fill_functor.cc
@@ -51,6 +51,11 @@ DEFINE_SETZERO_CPU(uint16);
 DEFINE_SETZERO_CPU(int16);
 DEFINE_SETZERO_CPU(int32);
 DEFINE_SETZERO_CPU(int64);
+DEFINE_SETZERO_CPU(quint8);
+DEFINE_SETZERO_CPU(qint8);
+DEFINE_SETZERO_CPU(quint16);
+DEFINE_SETZERO_CPU(qint16);
+DEFINE_SETZERO_CPU(qint32);
 DEFINE_SETZERO_CPU(complex64);
 DEFINE_SETZERO_CPU(complex128);
 DEFINE_SETZERO_CPU(Variant);
diff --git a/tensorflow/core/kernels/fuzzing/BUILD b/tensorflow/core/kernels/fuzzing/BUILD
index c9f025a5b051fcb0b20b12be57412f2c08e230d1..3c3e9bfa2e0a6f3f94c9c679994021929f9df489 100644
--- a/tensorflow/core/kernels/fuzzing/BUILD
+++ b/tensorflow/core/kernels/fuzzing/BUILD
@@ -8,11 +8,8 @@ cc_library(
     name = "fuzz_session",
     hdrs = ["fuzz_session.h"],
     deps = [
-        "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:scope",
         "//tensorflow/core:core_cpu",
-        "//tensorflow/core:direct_session",
-        "//tensorflow/core:ops",
         "//tensorflow/core:tensorflow",
     ],
 )
diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc
index f3684ffa2053cb59fa4c9aee9ab118595c3399b0..5859c20d89e06a9684c0abe4199c5d84778fc26f 100644
--- a/tensorflow/core/kernels/list_kernels.cc
+++ b/tensorflow/core/kernels/list_kernels.cc
@@ -259,14 +259,21 @@ class TensorListPushBack : public OpKernel {
                                   " max_num_elements: ", l->max_num_elements));
     }
 
-    TensorList output;
-    output = *l;
-    output.tensors.push_back(input);
-    Tensor* result;
     AllocatorAttributes attr;
     attr.set_on_host(true);
-    OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
-    result->scalar<Variant>()() = std::move(output);
+    std::unique_ptr<Tensor> maybe_result = c->forward_input(
+        0, 0, DT_VARIANT, TensorShape{}, c->input_memory_type(0), attr);
+    if (maybe_result != nullptr) {
+      maybe_result->scalar<Variant>()().get<TensorList>()->tensors.push_back(
+          input);
+    } else {
+      Tensor* result;
+      OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
+      TensorList output;
+      output = *l;
+      output.tensors.push_back(input);
+      result->scalar<Variant>()() = std::move(output);
+    }
   }
 
  private:
@@ -384,14 +391,20 @@ class TensorListPopBack : public OpKernel {
                 errors::InvalidArgument("Trying to pop from an empty list."));
 
     c->set_output(1, l->tensors.back());
-    TensorList output;
-    output = *l;
-    output.tensors.pop_back();
-    Tensor* result;
     AllocatorAttributes attr;
     attr.set_on_host(true);
-    OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
-    result->scalar<Variant>()() = std::move(output);
+    std::unique_ptr<Tensor> maybe_result = c->forward_input(
+        0, 0, DT_VARIANT, TensorShape{}, c->input_memory_type(0), attr);
+    if (maybe_result != nullptr) {
+      maybe_result->scalar<Variant>()().get<TensorList>()->tensors.pop_back();
+    } else {
+      TensorList output;
+      output = *l;
+      output.tensors.pop_back();
+      Tensor* result;
+      OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
+      result->scalar<Variant>()() = std::move(output);
+    }
   }
 
  private:
@@ -500,6 +513,65 @@ REGISTER_TENSOR_LIST_GET_ITEM_GPU(bool)
 
 #endif  // GOOGLE_CUDA
 
+class TensorListResize : public OpKernel {
+ public:
+  explicit TensorListResize(OpKernelConstruction* c) : OpKernel(c) {}
+
+  void Compute(OpKernelContext* c) override {
+    const TensorList* input_list =
+        c->input(0).scalar<Variant>()().get<TensorList>();
+    OP_REQUIRES(c, input_list != nullptr,
+                errors::InvalidArgument(
+                    "Input handle is not a list. Saw: '",
+                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    int32 size = c->input(1).scalar<int32>()();
+    OP_REQUIRES(
+        c, size >= 0,
+        errors::InvalidArgument(
+            "TensorListSlice expects size to be non-negative. Got: ", size));
+
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    std::unique_ptr<Tensor> maybe_result = c->forward_input(
+        0, 0, DT_VARIANT, TensorShape{}, c->input_memory_type(0), attr);
+    if (maybe_result != nullptr) {
+      maybe_result->scalar<Variant>()().get<TensorList>()->tensors.resize(
+          size, Tensor(DT_INVALID));
+    } else {
+      Tensor* result;
+      OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
+      TensorList output_list;
+      output_list.element_shape = input_list->element_shape;
+      output_list.element_dtype = input_list->element_dtype;
+      output_list.max_num_elements = input_list->max_num_elements;
+      if (size > input_list->tensors.size()) {
+        output_list.tensors.insert(output_list.tensors.begin(),
+                                   input_list->tensors.begin(),
+                                   input_list->tensors.end());
+        // Add DT_INVALID tensors to the end of the list if the requested size
+        // is larger than the list length.
+        output_list.tensors.resize(size, Tensor(DT_INVALID));
+      } else {
+        output_list.tensors.insert(output_list.tensors.begin(),
+                                   input_list->tensors.begin(),
+                                   input_list->tensors.begin() + size);
+      }
+      result->scalar<Variant>()() = std::move(output_list);
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("TensorListResize").Device(DEVICE_CPU),
+                        TensorListResize);
+
+#if GOOGLE_CUDA
+
+REGISTER_KERNEL_BUILDER(
+    Name("TensorListResize").Device(DEVICE_GPU).HostMemory("size"),
+    TensorListResize);
+
+#endif  // GOOGLE_CUDA
+
 class TensorListSetItem : public OpKernel {
  public:
   explicit TensorListSetItem(OpKernelConstruction* c) : OpKernel(c) {
@@ -529,14 +601,21 @@ class TensorListSetItem : public OpKernel {
                     "list index. Item element shape: ",
                     value.shape().DebugString(),
                     " list shape: ", l->element_shape.DebugString()));
-    TensorList output;
-    output = *l;
-    output.tensors[index] = value;
-    Tensor* result;
     AllocatorAttributes attr;
     attr.set_on_host(true);
-    OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
-    result->scalar<Variant>()() = std::move(output);
+    std::unique_ptr<Tensor> maybe_result = c->forward_input(
+        0, 0, DT_VARIANT, TensorShape{}, c->input_memory_type(0), attr);
+    if (maybe_result != nullptr) {
+      maybe_result->scalar<Variant>()().get<TensorList>()->tensors[index] =
+          value;
+    } else {
+      TensorList output;
+      output = *l;
+      output.tensors[index] = value;
+      Tensor* result;
+      OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
+      result->scalar<Variant>()() = std::move(output);
+    }
   }
 
  private:
diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h
index 686679474c40dc922683786cdfe65ffb3fbc03e2..fd1be80f11652745410931ed6224cecc47e6c9de 100644
--- a/tensorflow/core/kernels/list_kernels.h
+++ b/tensorflow/core/kernels/list_kernels.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -65,6 +66,15 @@ struct TensorList {
 
 Status TensorShapeFromTensor(const Tensor& t, PartialTensorShape* out);
 
+// Allocates a Tensor of requested shape and dtype and fills it with zeros.
+template <typename Device, typename T>
+void BuildZerosTensor(OpKernelContext* c, DataType dtype,
+                      const TensorShape& shape, Tensor* zeros) {
+  OP_REQUIRES_OK(c, c->allocate_temp(dtype, shape, zeros));
+  functor::SetZeroFunctor<Device, T> f;
+  f(c->eigen_device<Device>(), zeros->flat<T>());
+}
+
 template <typename Device, typename T>
 class TensorListStack : public OpKernel {
  public:
@@ -94,7 +104,7 @@ class TensorListStack : public OpKernel {
         !tensor_list->tensors.empty() ||
             tensor_list->element_shape.IsFullyDefined(),
         errors::InvalidArgument("Tried to stack elements of a empty ",
-                                "list with non-fully-defined shape: ",
+                                "list with non-fully-defined element_shape: ",
                                 tensor_list->element_shape.DebugString()));
     if (num_elements_ != -1) {
       OP_REQUIRES(c, tensor_list->tensors.size() == num_elements_,
@@ -106,34 +116,59 @@ class TensorListStack : public OpKernel {
     // Compute the shape of the output tensor.
     // If `element_shape` is fully-defined it gets used. It is assumed that all
     // element tensors have the same shape.
-    // If `element_shape` is not fully-defined the shape of the first element
-    // tensor is used and it is checked that all other tensors have the same
-    // shape.
-    TensorShape resulting_shape;
-    if (!tensor_list->element_shape.AsTensorShape(&resulting_shape)) {
-      const Tensor& t = tensor_list->tensors[0];
-      resulting_shape = t.shape();
-      for (int i = 1; i < tensor_list->tensors.size(); ++i) {
+    // If `element_shape` is not fully-defined the shape of the first
+    // initialized element tensor is used and it is checked that all other
+    // initialized tensors have the same shape. An error is thrown if the list
+    // only contains DT_INVALID type tensors.
+    TensorShape resulting_element_shape;
+    if (!tensor_list->element_shape.AsTensorShape(&resulting_element_shape)) {
+      bool resulting_element_shape_initialized = false;
+      for (int i = 0; i < tensor_list->tensors.size(); ++i) {
         const Tensor& t = tensor_list->tensors[i];
-        OP_REQUIRES(c, t.shape() == resulting_shape,
-                    errors::InvalidArgument(
-                        "Tried to stack tensors with unequal shapes: ",
-                        resulting_shape.DebugString(), " vs ",
-                        t.shape().DebugString()));
+        if (!resulting_element_shape_initialized) {
+          if (t.dtype() == DT_INVALID) {
+            continue;
+          }
+          resulting_element_shape = t.shape();
+          resulting_element_shape_initialized = true;
+          continue;
+        }
+        OP_REQUIRES(
+            c, t.dtype() == DT_INVALID || t.shape() == resulting_element_shape,
+            errors::InvalidArgument(
+                "Tried to stack tensors with unequal shapes: ",
+                resulting_element_shape.DebugString(), " vs ",
+                t.shape().DebugString()));
       }
+      OP_REQUIRES(
+          c, resulting_element_shape_initialized,
+          errors::InvalidArgument("Tried to stack list which only contains ",
+                                  "uninitialized tensors and has a ",
+                                  "non-fully-defined element_shape: ",
+                                  tensor_list->element_shape.DebugString()));
     }
-    resulting_shape.InsertDim(0, tensor_list->tensors.size());
+    TensorShape output_tensor_shape = resulting_element_shape;
+    output_tensor_shape.InsertDim(0, tensor_list->tensors.size());
     Tensor* output;
-    OP_REQUIRES_OK(c, c->allocate_output(0, resulting_shape, &output));
+    OP_REQUIRES_OK(c, c->allocate_output(0, output_tensor_shape, &output));
     if (output->NumElements() == 0) {
       return;
     }
 
     ConstMatrixVector inputs_flat;
     inputs_flat.reserve(tensor_list->tensors.size());
+    Tensor zeros;
+    BuildZerosTensor<Device, T>(c, element_dtype_, resulting_element_shape,
+                                &zeros);
     for (const auto& t : tensor_list->tensors) {
-      inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
-          t.shaped<T, 2>({1, t.NumElements()})));
+      if (t.dtype() != DT_INVALID) {
+        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+            t.shaped<T, 2>({1, t.NumElements()})));
+      } else {
+        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+            const_cast<const Tensor&>(zeros).shaped<T, 2>(
+                {1, zeros.NumElements()})));
+      }
     }
     auto output_flat = output->shaped<T, 2>({1, output->NumElements()});
 
@@ -158,6 +193,17 @@ class TensorListConcat : public OpKernel {
       std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>;
   explicit TensorListConcat(OpKernelConstruction* c) : OpKernel(c) {
     OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+    // TODO(skyewm): the HasAttr check can be removed once the
+    // element_shape_except_first_dim attr has been checked in for 2 weeks
+    // (around 1/14/2019).
+    if (c->HasAttr("element_shape")) {
+      PartialTensorShape element_shape;
+      OP_REQUIRES_OK(c, c->GetAttr("element_shape", &element_shape));
+      if (!element_shape.unknown_rank()) {
+        element_shape_except_first_dim_ = PartialTensorShape(
+            gtl::ArraySlice<int64>(element_shape.dim_sizes()).subspan(1));
+      }
+    }
   }
 
   ~TensorListConcat() {}
@@ -178,29 +224,33 @@ class TensorListConcat : public OpKernel {
             " but list elements ", DataTypeString(tensor_list->element_dtype)));
     // If the TensorList is empty, its element_shape must be fully defined
     // except for the first dimension.
-    PartialTensorShape shape_except_first_dim;
-    if (!tensor_list->element_shape.unknown_rank()) {
-      OP_REQUIRES(c, tensor_list->element_shape.dims() >= 1,
-                  errors::InvalidArgument(
-                      "Concat requires elements to be at least vectors, ",
-                      "found scalars instead."));
-      shape_except_first_dim = PartialTensorShape(
-          gtl::ArraySlice<int64>(tensor_list->element_shape.dim_sizes())
-              .subspan(1));
+    if (!element_shape_except_first_dim_.IsFullyDefined()) {
+      if (!tensor_list->element_shape.unknown_rank()) {
+        OP_REQUIRES(c, tensor_list->element_shape.dims() >= 1,
+                    errors::InvalidArgument(
+                        "Concat requires elements to be at least vectors, ",
+                        "found scalars instead."));
+        PartialTensorShape shape_except_first_dim(
+            gtl::ArraySlice<int64>(tensor_list->element_shape.dim_sizes())
+                .subspan(1));
+        PartialTensorShape tmp = element_shape_except_first_dim_;
+        OP_REQUIRES_OK(c, tmp.MergeWith(shape_except_first_dim,
+                                        &element_shape_except_first_dim_));
+      }
     }
     OP_REQUIRES(c,
                 !tensor_list->tensors.empty() ||
-                    shape_except_first_dim.IsFullyDefined(),
+                    element_shape_except_first_dim_.IsFullyDefined(),
                 errors::InvalidArgument(
                     "All except the first dimension must be fully defined ",
                     "when concating an empty tensor list. element_shape: ",
                     tensor_list->element_shape.DebugString()));
     // 1. Compute the shape of the output tensor.
-    // If `shape_except_first_dim` is fully-defined we just prepend the leading
-    // dim to it. Otherwise we use the shape of the first element tensor and
-    // check to make sure shapes of all tensors are compatible.
+    // If `element_shape_except_first_dim_` is fully-defined we just prepend the
+    // leading dim to it. Otherwise we use the shape of the first element tensor
+    // and check to make sure shapes of all tensors are compatible.
     TensorShape output_shape;
-    if (!shape_except_first_dim.AsTensorShape(&output_shape)) {
+    if (!element_shape_except_first_dim_.AsTensorShape(&output_shape)) {
       const Tensor& element_tensor = tensor_list->tensors[0];
       OP_REQUIRES(
           c, TensorShapeUtils::IsVectorOrHigher(element_tensor.shape()),
@@ -268,6 +318,7 @@ class TensorListConcat : public OpKernel {
 
  private:
   DataType element_dtype_;
+  PartialTensorShape element_shape_except_first_dim_;
 };
 
 template <typename Device, typename T>
@@ -505,14 +556,31 @@ class TensorListScatter : public OpKernel {
                     "Specified a list with shape ", element_shape.DebugString(),
                     " from a tensor with shape ", output_shape.DebugString()));
     output_list.element_shape = element_shape;
-    output_list.tensors.reserve(indices.NumElements());
+
+    OP_REQUIRES(c, indices.NumElements() == input_tensor.shape().dim_size(0),
+                errors::InvalidArgument(
+                    "Invalid number of rows in input tensor. Expected: ",
+                    indices.NumElements(),
+                    " Actual: ", input_tensor.shape().dim_size(0)));
+
+    // Validate indices and resize output_list.tensors to fit the highest index.
+    {
+      size_t list_size = 0;
+      for (int index = 0; index < indices.NumElements(); ++index) {
+        const int i = indices.flat<int32>()(index);
+        OP_REQUIRES(c, i >= 0,
+                    errors::InvalidArgument(
+                        "Indices in TensorListScatter must all be positive."));
+        if (i >= list_size) {
+          list_size = i + 1;
+        }
+      }
+      output_list.tensors.resize(list_size, Tensor(DT_INVALID));
+    }
+
     for (int index = 0; index < indices.NumElements(); ++index) {
       const int i = indices.flat<int32>()(index);
-      OP_REQUIRES(c, i < input_tensor.shape().dim_size(0),
-                  errors::InvalidArgument(
-                      "Trying to scatter index ", i, " from tensor with ",
-                      input_tensor.shape().dim_size(0), " rows."));
-      Tensor tmp = input_tensor.Slice(i, i + 1);
+      Tensor tmp = input_tensor.Slice(index, index + 1);
       TensorShape tmp_shape = tmp.shape();
       tmp_shape.RemoveDim(0);
       OP_REQUIRES(c, tmp.CopyFrom(tmp, tmp_shape),
@@ -525,7 +593,7 @@ class TensorListScatter : public OpKernel {
       // many small ondes.
       aligned.flat<T>().device(c->eigen_device<Device>()) =
           tmp.unaligned_flat<T>();
-      output_list.tensors.push_back(aligned);
+      std::swap(output_list.tensors[i], aligned);
     }
     output_tensor->scalar<Variant>()() = std::move(output_list);
   }
diff --git a/tensorflow/core/kernels/map_stage_op.cc b/tensorflow/core/kernels/map_stage_op.cc
index dd89597369bce0dcfd8ae8ad7e2bfc47d8ae2817..27a8696e54647e14eda209c36b7b49c1d171d3bc 100644
--- a/tensorflow/core/kernels/map_stage_op.cc
+++ b/tensorflow/core/kernels/map_stage_op.cc
@@ -480,7 +480,7 @@ class StagingMap : public ResourceBase {
     return map_.size();
   }
 
-  string DebugString() override { return "StagingMap"; }
+  string DebugString() const override { return "StagingMap"; }
 };
 
 template <bool Ordered>
diff --git a/tensorflow/core/kernels/matrix_solve_op.cc b/tensorflow/core/kernels/matrix_solve_op.cc
index 169f3dae76d2fb6d0515d22648a9047657af0032..f3919a16aa50694fa5e05eb2cc421f1dd3f378a1 100644
--- a/tensorflow/core/kernels/matrix_solve_op.cc
+++ b/tensorflow/core/kernels/matrix_solve_op.cc
@@ -214,9 +214,12 @@ class MatrixSolveOpGpu : public AsyncOpKernel {
     auto input_copy_ptrs = solver->GetScratchSpace<uint8>(
         sizeof(Scalar*) * batch_size, "input_copt_ptrs",
         /* on_host */ true);
-    if (n / batch_size <= 128) {
-      // For small matrices or large batch sizes, we use the batched
-      // interface from cuBlas.
+    const int kMaxMatrixSizeToBatchSizeRatio = 128;
+    const bool use_batched_solver =
+        n <= kMaxMatrixSizeToBatchSizeRatio * batch_size;
+    if (use_batched_solver) {
+      // For small matrices or large batch sizes, we use the batched interface
+      // from cuBlas.
       const Scalar** input_copy_ptrs_base =
           reinterpret_cast<const Scalar**>(input_copy_ptrs.mutable_data());
       for (int batch = 0; batch < batch_size; ++batch) {
@@ -230,8 +233,8 @@ class MatrixSolveOpGpu : public AsyncOpKernel {
                                &dev_info.back(), batch_size),
           done);
     } else {
-      // For small batch sizes we use the non-batched interface from cuSolver,
-      // which is much faster for large matrices.
+      // For small batch sizes or large matrices, we use the non-batched
+      // interface from cuSolver, which is much faster for large matrices.
       dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "getrf"));
       for (int batch = 0; batch < batch_size; ++batch) {
         OP_REQUIRES_OK_ASYNC(
@@ -279,11 +282,7 @@ class MatrixSolveOpGpu : public AsyncOpKernel {
         /* on_host */ true);
     auto transposed_rhs_reshaped =
         transposed_rhs.template flat_inner_dims<Scalar, 3>();
-    // TODO(rmlarsen): Enable the following branch when I figure
-    // out why it causes a segfault.
-    if (false && n / batch_size <= 128) {
-      dev_info.push_back(
-          solver->GetDeviceLapackInfo(batch_size, "GetrsBatched"));
+    if (use_batched_solver) {
       const Scalar** input_copy_ptrs_base =
           reinterpret_cast<const Scalar**>(input_copy_ptr_array.mutable_data());
       const Scalar** transposed_rhs_ptrs_base =
@@ -293,13 +292,20 @@ class MatrixSolveOpGpu : public AsyncOpKernel {
         input_copy_ptrs_base[batch] = &input_copy_reshaped(batch, 0, 0);
         transposed_rhs_ptrs_base[batch] = &transposed_rhs_reshaped(batch, 0, 0);
       }
+      int host_info = 0;
       OP_REQUIRES_OK_ASYNC(
           context,
           solver->GetrsBatched(adjoint_ ? CUBLAS_OP_C : CUBLAS_OP_T, n, nrhs,
                                input_copy_ptrs_base, n, pivots_mat.data(),
-                               transposed_rhs_ptrs_base, n, &dev_info.back(),
+                               transposed_rhs_ptrs_base, n, &host_info,
                                batch_size),
           done);
+      OP_REQUIRES_ASYNC(
+          context, host_info == 0,
+          errors::InvalidArgument("The ", -host_info,
+                                  "'th argument to cublas*getrsBatched had "
+                                  "an illegal value."),
+          done);
     } else {
       dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "getrs"));
       for (int batch = 0; batch < batch_size; ++batch) {
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 507fc9983776d2fd54ca66cc70aa7695886b4b5e..ab235843f741a7f8bbfb7fa97cbe438ec5212b72 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -41,6 +41,7 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 
 #if GOOGLE_CUDA
+#include "cuda/include/cudnn.h"
 #include "tensorflow/core/kernels/maxpooling_op_gpu.h"
 #include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
@@ -1134,11 +1135,18 @@ class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
                 errors::InvalidArgument(
                     "qint8 should be used with data_format NCHW_VECT_C."));
 
+#if CUDNN_VERSION >= 7300
+    if (use_dnn_) {
+      DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_,
+                               stride_, padding_, data_format_, tensor_in,
+                               out_shape, propagate_nans_);
+#else
     // These is_int8x4 checks avoid linker errors for missing qint8 kernels.
     if (!is_int8x4 && use_dnn_ && data_format_ == FORMAT_NCHW) {
       DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_,
                                stride_, padding_, data_format_, tensor_in,
                                out_shape, propagate_nans_);
+#endif
     } else {
       Tensor* output = nullptr;
       OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
index 56d0340547a891fe4929bd6a36a72c5e03d1d1e0..f28811ffa4d740e6733b33189a0228bea2428b19 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -390,7 +390,6 @@ bool MaxPoolForwardNoMask_NCHW_VECT_C::operator()(
       0, d.stream()>>>(output_size, bottom_data, height, width, channels,
                        pooled_height, pooled_width, kernel_h, kernel_w,
                        stride_h, stride_w, pad_t, pad_l, top_data);
-  d.synchronize();
   return d.ok();
 }
 
diff --git a/tensorflow/core/kernels/meta_support.cc b/tensorflow/core/kernels/meta_support.cc
index 39e60c9fcef174a4f9e2271600ed847f4e769625..44f2997e182a912476aeab86f1158845b5f1118e 100644
--- a/tensorflow/core/kernels/meta_support.cc
+++ b/tensorflow/core/kernels/meta_support.cc
@@ -54,7 +54,7 @@ class Scratch : public ResourceBase {
 
   uint8_t* buffer() { return scratch_32_aligned_; }
 
-  string DebugString() { return "MetaGemmScratchResource"; }
+  string DebugString() const override { return "MetaGemmScratchResource"; }
 
  private:
   std::unique_ptr<uint8_t> scratch_;
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index 2ec6c8fa897464be4dba35a5446b8452d12a40d8..1ae42a0d0d74ef7e2e12fe7427cadfc043774c70 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -13,678 +13,25 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #ifdef INTEL_MKL
-
+#include "mkldnn.hpp"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#ifndef INTEL_MKL_ML_ONLY
-#include "mkldnn.hpp"
 using mkldnn::batch_normalization_backward;
 using mkldnn::batch_normalization_forward;
 using mkldnn::prop_kind;
 using mkldnn::stream;
 using mkldnn::use_global_stats;
 using mkldnn::use_scale_shift;
-#else
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#endif
-
-#include "tensorflow/core/util/mkl_util.h"
-// TODO(inteltf) Address comments from PR 8968.
 
 namespace tensorflow {
 using CPUDevice = Eigen::ThreadPoolDevice;
 
-#ifdef INTEL_MKL_ML_ONLY
-
-template <typename Device, typename T>
-class MklFusedBatchNormOp : public OpKernel {
- public:
-  explicit MklFusedBatchNormOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    float epsilon;
-    OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon));
-    epsilon_ = T(epsilon);
-    string tensor_format;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format));
-    OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_),
-                errors::InvalidArgument("Invalid data format"));
-    OP_REQUIRES_OK(context, context->GetAttr("is_training", &is_training_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    MklFusedBatchNormOpContext mkl_context;
-    const Tensor& input = MklGetInput(context, 0);
-    const Tensor& scale = MklGetInput(context, 1);
-    const Tensor& shift = MklGetInput(context, 2);
-    const Tensor& est_mean = MklGetInput(context, 3);
-    const Tensor& est_variance = MklGetInput(context, 4);
-
-    GetMklShape(context, 0, &(mkl_context.mkl_shape_input_shape));
-    bool input_in_mkl_format = mkl_context.mkl_shape_input_shape.IsMklTensor();
-
-    if (!input_in_mkl_format) {
-      OP_REQUIRES(context, input.dims() == 4,
-                  errors::InvalidArgument("input must be 4-dimensional",
-                                          input.shape().DebugString()));
-    }
-    OP_REQUIRES(context, scale.dims() == 1,
-                errors::InvalidArgument("scale must be 1-dimensional",
-                                        scale.shape().DebugString()));
-    OP_REQUIRES(context, shift.dims() == 1,
-                errors::InvalidArgument("offset must be 1-dimensional",
-                                        shift.shape().DebugString()));
-    OP_REQUIRES(context, est_mean.dims() == 1,
-                errors::InvalidArgument("estimated_mean must be 1-dimensional",
-                                        est_mean.shape().DebugString()));
-
-    OP_REQUIRES(
-        context, est_variance.dims() == 1,
-        errors::InvalidArgument("estimated_variance must be 1-dimensional",
-                                est_variance.shape().DebugString()));
-
-    if (is_training_) {
-      OP_REQUIRES(context, est_mean.dim_size(0) == 0,
-                  errors::InvalidArgument("estimated_mean empty for training",
-                                          est_mean.shape().DebugString()));
-      OP_REQUIRES(context, est_variance.dim_size(0) == 0,
-                  errors::InvalidArgument(
-                      "estimated_variance must be empty for training",
-                      est_variance.shape().DebugString()));
-    }
-
-    unsigned int flag_batch_norm =
-        is_training_ ? dnnUseScaleShift
-                     : (dnnUseInputMeanVariance | dnnUseScaleShift);
-
-    mkl_context.MklExtractParams(context, tensor_format_);
-
-    // Create layout only for input data as it is used in Op primitive.
-    mkl_context.MklCreateInputLayout(context);
-
-    // Create Op primitive.
-    CHECK_EQ(dnnBatchNormalizationCreateForward_v2_F32(
-                 &(mkl_context.mkl_prim_batchnorm), nullptr,
-                 mkl_context.mkl_lt_input, static_cast<float>(epsilon_),
-                 flag_batch_norm),
-             E_SUCCESS);
-
-    // Temporary tensors with buffers for the context inputs, if
-    // conversion to MKL-Op specific layouts are required. It is assumed here
-    // that TF's 1D tensors (scale, shift, est_mean, and est_variance) won't
-    // require any conversion.
-    // Since scale-shift is combined in MKL, a buffer is required.
-    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_scale_shift_buf_tensor;
-    mkl_context.MklPrepareContextInputs(context, &mkl_tmp_input_buf_tensor,
-                                        &mkl_tmp_scale_shift_buf_tensor);
-
-    // Output data in MKL layout
-    Tensor* output = nullptr;
-    TensorShape tf_shape_output;
-    MklShape mkl_shape_output;
-    mkl_shape_output.SetMklTensor(true);
-    mkl_shape_output.SetMklLayout(mkl_context.mkl_prim_batchnorm,
-                                  dnnResourceDst);
-    mkl_shape_output.SetTfLayout(mkl_context.mkl_params.in_dim,
-                                 mkl_context.mkl_params.in_sizes,
-                                 mkl_context.mkl_params.in_strides);
-    mkl_shape_output.SetTfDimOrder(mkl_context.mkl_params.in_dim,
-                                   tensor_format_);
-    tf_shape_output.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                               mkl_shape_output.GetMklLayout())) /
-                           sizeof(T));
-    AllocateOutputSetMklShape(context, 0, &output, tf_shape_output,
-                              mkl_shape_output);
-    mkl_context.mkl_res_batchnorm[dnnResourceDst] =
-        static_cast<void*>(output->flat<T>().data());
-
-    // Batch mean in TF layout
-    Tensor* batch_mean = nullptr;
-    MklShape mkl_shape_batch_mean;
-    mkl_shape_batch_mean.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, 1, &batch_mean, scale.shape(),
-                              mkl_shape_batch_mean);
-    // Batch variance in TF layout
-    Tensor* batch_variance = nullptr;
-    MklShape mkl_shape_batch_variance;
-    mkl_shape_batch_variance.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, 2, &batch_variance, scale.shape(),
-                              mkl_shape_batch_variance);
-    // If training mode, set dnnResourceMean and dnnResourceVariance to
-    // output tensors for batch mean and variance.
-    // Otherwise, set dnnResourceMean and dnnResourceVariance to
-    // estimated mean and variance.
-    if (is_training_)
-      mkl_context.MklSetMeanVariance(*batch_mean, *batch_variance);
-    else
-      mkl_context.MklSetMeanVariance(est_mean, est_variance);
-
-    // Now that all resources are set, it is ready for dnnExecute
-    CHECK_EQ(dnnExecute_F32(mkl_context.mkl_prim_batchnorm,
-                            mkl_context.mkl_res_batchnorm),
-             E_SUCCESS);
-
-    // Mean and variance (without Bessel's correction) saved for backward
-    // computation to serve as pre-computed mean and variance.
-    Tensor* saved_mean = nullptr;
-    MklShape mkl_shape_saved_mean;
-    mkl_shape_saved_mean.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, 3, &saved_mean, scale.shape(),
-                              mkl_shape_saved_mean);
-    std::memcpy(
-        reinterpret_cast<char*>(saved_mean->flat<float>().data()),
-        reinterpret_cast<char*>(mkl_context.mkl_res_batchnorm[dnnResourceMean]),
-        scale.NumElements() * sizeof(float));
-    Tensor* saved_variance = nullptr;
-    MklShape mkl_shape_saved_variance;
-    mkl_shape_saved_variance.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, 4, &saved_variance, scale.shape(),
-                              mkl_shape_saved_variance);
-    std::memcpy(reinterpret_cast<char*>(saved_variance->flat<float>().data()),
-                reinterpret_cast<char*>(
-                    mkl_context.mkl_res_batchnorm[dnnResourceVariance]),
-                scale.NumElements() * sizeof(float));
-
-    // Bessel's correction on variance, if training mode is on
-    if (is_training_) {
-      float* p_var = static_cast<float*>(batch_variance->flat<T>().data());
-      auto depth = mkl_context.mkl_params.depth;
-      size_t orig_size = mkl_context.mkl_params.in_sizes[0] *
-                         mkl_context.mkl_params.in_sizes[1] *
-                         mkl_context.mkl_params.in_sizes[3];
-      size_t adjust_size = orig_size - 1;
-      float adjust_factor = (static_cast<float>(orig_size)) / adjust_size;
-      for (int i = 0; i < depth; i++) p_var[i] = adjust_factor * p_var[i];
-    }
-
-    mkl_context.MklCleanup();
-  }
-
- private:
-  T epsilon_;
-  TensorFormat tensor_format_;
-  bool is_training_;
-
-  // Structure containing all info for MklOp
-  typedef struct {
-    // Parameters used for input and output layouts
-    struct MklBatchNormParams {
-      // BatchNormOp src and
-      size_t in_dim;
-      size_t in_sizes[4];
-      size_t in_strides[4];
-      size_t depth;  // Batch normalization is done for per channel.
-    } mkl_params;
-
-    MklShape mkl_shape_input_shape;
-
-    // MKL primitive and resources for BatchNormOp
-    dnnPrimitive_t mkl_prim_batchnorm = nullptr;
-    void* mkl_res_batchnorm[dnnResourceNumber];
-
-    // MKL layouts for inputs in the context
-    dnnLayout_t mkl_lt_input = nullptr;
-
-    void MklCleanup() {
-      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
-      if (!input_in_mkl_format) dnnLayoutDelete_F32(mkl_lt_input);
-      if (mkl_prim_batchnorm != nullptr) dnnDelete_F32(mkl_prim_batchnorm);
-    }
-
-    void MklExtractParams(OpKernelContext* context,
-                          const TensorFormat& tensor_format) {
-      const Tensor& input = MklGetInput(context, 0);
-      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
-      mkl_params.in_dim = input_in_mkl_format
-                              ? mkl_shape_input_shape.GetDimension()
-                              : input.dims();
-      mkl_params.in_sizes[0] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[0]
-                              : GetTensorDim(input, tensor_format, 'W'));
-      mkl_params.in_sizes[1] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[1]
-                              : GetTensorDim(input, tensor_format, 'H'));
-      mkl_params.in_sizes[2] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[2]
-                              : GetTensorDim(input, tensor_format, 'C'));
-      mkl_params.in_sizes[3] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[3]
-                              : GetTensorDim(input, tensor_format, 'N'));
-      mkl_params.depth = mkl_params.in_sizes[2];
-      GetStridesFromSizes(tensor_format, mkl_params.in_strides,
-                          mkl_params.in_sizes);
-    }
-
-    void MklCreateInputLayout(OpKernelContext* context) {
-      const Tensor& input = MklGetInput(context, 0);
-      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
-      if (input_in_mkl_format) {
-        mkl_lt_input =
-            static_cast<dnnLayout_t>(mkl_shape_input_shape.GetCurLayout());
-      } else {
-        CHECK_EQ(
-            dnnLayoutCreate_F32(&mkl_lt_input, mkl_params.in_dim,
-                                mkl_params.in_sizes, mkl_params.in_strides),
-            E_SUCCESS);
-      }
-    }
-    void MklPrepareContextInputs(OpKernelContext* context,
-                                 Tensor* mkl_tmp_input_buf_tensor,
-                                 Tensor* mkl_tmp_scale_shift_buf_tensor) {
-      bool mkl_convert_input;
-      dnnPrimitive_t mkl_prim_convert_input = nullptr;
-      dnnLayout_t mkl_lt_internal_input = nullptr;
-      void* mkl_buf_converted_input = nullptr;
-      // Compare with internal layouts and convert if needed
-      const Tensor& input = MklGetInput(context, 0);
-      void* mkl_buf_input =
-          const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
-                   &mkl_lt_internal_input, mkl_prim_batchnorm, dnnResourceSrc),
-               E_SUCCESS);
-      mkl_convert_input =
-          !dnnLayoutCompare_F32(mkl_lt_internal_input, mkl_lt_input);
-      if (mkl_convert_input) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, mkl_lt_input,
-                                         mkl_lt_internal_input),
-                 E_SUCCESS);
-        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
-                       &mkl_buf_converted_input);
-        CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
-                                          mkl_buf_converted_input),
-                 E_SUCCESS);
-        dnnDelete_F32(mkl_prim_convert_input);
-      }
-      dnnLayoutDelete_F32(mkl_lt_internal_input);
-      mkl_res_batchnorm[dnnResourceSrc] =
-          (mkl_convert_input) ? mkl_buf_converted_input : mkl_buf_input;
-
-      // scale-shift layout is created from primitive. So no conversion
-      // is needed, however, a buffer has to be allocated.
-      dnnLayout_t mkl_lt_scale_shift = nullptr;
-      void* mkl_buf_scale_shift = nullptr;
-      CHECK_EQ(
-          dnnLayoutCreateFromPrimitive_F32(
-              &mkl_lt_scale_shift, mkl_prim_batchnorm, dnnResourceScaleShift),
-          E_SUCCESS);
-      AllocTmpBuffer(context, mkl_tmp_scale_shift_buf_tensor,
-                     mkl_lt_scale_shift, &mkl_buf_scale_shift);
-      // Fill the scale-shift buffer with data, presumably buffer is 2D array
-      const Tensor& scale = MklGetInput(context, 1);
-      const Tensor& shift = MklGetInput(context, 2);
-      float* buf_scale_shift = static_cast<float*>(mkl_buf_scale_shift);
-      float* buf_scale = const_cast<float*>(
-          static_cast<const float*>(scale.flat<float>().data()));
-      float* buf_shift = const_cast<float*>(
-          static_cast<const float*>(shift.flat<float>().data()));
-      auto depth = mkl_params.depth;
-      for (int i = 0; i < depth; i++) {
-        buf_scale_shift[i] = buf_scale[i];
-        buf_scale_shift[i + depth] = buf_shift[i];
-      }
-      mkl_res_batchnorm[dnnResourceScaleShift] = mkl_buf_scale_shift;
-    }
-
-    inline void MklSetMeanVariance(const Tensor& mean, const Tensor& variance) {
-      mkl_res_batchnorm[dnnResourceMean] = const_cast<void*>(
-          static_cast<const void*>(mean.flat<float>().data()));
-      mkl_res_batchnorm[dnnResourceVariance] = const_cast<void*>(
-          static_cast<const void*>(variance.flat<float>().data()));
-    }
-  } MklFusedBatchNormOpContext;
-};
-
-template <typename Device, typename T>
-class MklFusedBatchNormGradOp : public OpKernel {
- public:
-  explicit MklFusedBatchNormGradOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    float epsilon;
-    OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon));
-    epsilon_ = T(epsilon);
-    string tensor_format;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format));
-    OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_),
-                errors::InvalidArgument("Invalid data format"));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    MklFusedBatchNormGradOpContext mkl_context;
-
-    const Tensor& out_backprop = MklGetInput(context, 0);
-    const Tensor& input = MklGetInput(context, 1);
-    const Tensor& scale = MklGetInput(context, 2);
-    const Tensor& saved_mean = MklGetInput(context, 3);
-    const Tensor& saved_var = MklGetInput(context, 4);
-
-    // Here scale, mean, and variance are 1D and considered
-    // those having same layout in MKL and TF
-    GetMklShape(context, 0, &(mkl_context.mkl_shape_out_backprop));
-    GetMklShape(context, 1, &(mkl_context.mkl_shape_input_shape));
-
-    bool input_in_mkl_format = mkl_context.mkl_shape_input_shape.IsMklTensor();
-    bool out_backprop_in_mkl_format =
-        mkl_context.mkl_shape_out_backprop.IsMklTensor();
-    if (!out_backprop_in_mkl_format) {
-      OP_REQUIRES(context, out_backprop.dims() == 4,
-                  errors::InvalidArgument("input must be 4-dimensional",
-                                          out_backprop.shape().DebugString()));
-    }
-    if (!input_in_mkl_format) {
-      OP_REQUIRES(context, input.dims() == 4,
-                  errors::InvalidArgument("input must be 4-dimensional",
-                                          input.shape().DebugString()));
-    }
-    OP_REQUIRES(context, scale.dims() == 1,
-                errors::InvalidArgument("scale must be 1-dimensional",
-                                        scale.shape().DebugString()));
-    OP_REQUIRES(context, saved_mean.dims() == 1,
-                errors::InvalidArgument("saved mean must be 1-dimensional",
-                                        saved_mean.shape().DebugString()));
-    OP_REQUIRES(context, saved_var.dims() == 1,
-                errors::InvalidArgument("saved variance must be 1-dimensional",
-                                        saved_var.shape().DebugString()));
-
-    mkl_context.MklExtractParams(context, tensor_format_);
-
-    mkl_context.MklCreateInputLayout(context);
-
-    unsigned int flag_batch_norm_grad = dnnUseScaleShift;
-
-    // Create Backward Op primitive.
-    CHECK_EQ(dnnBatchNormalizationCreateBackward_v2_F32(
-                 &(mkl_context.mkl_prim_batchnorm_bwd), nullptr,
-                 mkl_context.mkl_lt_input, static_cast<float>(epsilon_),
-                 flag_batch_norm_grad),
-             E_SUCCESS);
-
-    // Temporary tensors and their buffers if conversion is required
-    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_outbackprop_buf_tensor,
-        mkl_tmp_scaleshift_buf_tensor;
-    mkl_context.MklPrepareContextInputs(context, &mkl_tmp_input_buf_tensor,
-                                        &mkl_tmp_outbackprop_buf_tensor,
-                                        &mkl_tmp_scaleshift_buf_tensor);
-
-    // Allocate tensor for grad w.r.t. input(x)
-    Tensor* in_backprop = nullptr;
-    TensorShape tf_shape_in_backprop;
-    MklShape mkl_shape_in_backprop;
-    mkl_shape_in_backprop.SetMklTensor(true);
-    mkl_shape_in_backprop.SetMklLayout(mkl_context.mkl_prim_batchnorm_bwd,
-                                       dnnResourceDiffSrc);
-    mkl_shape_in_backprop.SetTfLayout(mkl_context.mkl_params.in_dims,
-                                      mkl_context.mkl_params.in_sizes,
-                                      mkl_context.mkl_params.in_strides);
-    mkl_shape_in_backprop.SetTfDimOrder(mkl_context.mkl_params.in_dims,
-                                        tensor_format_);
-    tf_shape_in_backprop.AddDim(
-        dnnLayoutGetMemorySize_F32(
-            static_cast<dnnLayout_t>(mkl_shape_in_backprop.GetMklLayout())) /
-        sizeof(T));
-    AllocateOutputSetMklShape(context, 0, &in_backprop, tf_shape_in_backprop,
-                              mkl_shape_in_backprop);
-    mkl_context.mkl_res_batchnorm_bwd[dnnResourceDiffSrc] =
-        static_cast<void*>(in_backprop->flat<T>().data());
-
-    // grad_scale and grad_shift are combined together in MKL
-    // So create a single temporary buffer for those.
-    // Also set dnnResourceDiffScaleShift to the temporary buffer
-    Tensor mkl_tmp_grad_scale_shift_buf_tensor;
-    mkl_context.MklPrepareGradScaleShift(context,
-                                         &mkl_tmp_grad_scale_shift_buf_tensor);
-
-    // All dnn resources are set now, ready to execute
-    CHECK_EQ(dnnExecute_F32(mkl_context.mkl_prim_batchnorm_bwd,
-                            mkl_context.mkl_res_batchnorm_bwd),
-             E_SUCCESS);
-
-    // Now separate out scale and shift grad and copy to individual tensors
-    const TensorShape& tf_shape_scale_shift = scale.shape();
-    // Allocate tensor for grad w.r.t. scale (beta)
-    Tensor* scale_backprop = nullptr;
-    MklShape mkl_shape_scale_backprop;
-    AllocateOutputSetMklShape(context, 1, &scale_backprop, tf_shape_scale_shift,
-                              mkl_shape_scale_backprop);
-
-    // Allocate tensor for grad w.r.t. shift(gamma)
-    Tensor* shift_backprop = nullptr;
-    MklShape mkl_shape_shift_backprop;
-    AllocateOutputSetMklShape(context, 2, &shift_backprop, tf_shape_scale_shift,
-                              mkl_shape_shift_backprop);
-
-    // copy scale and shift grads to tensors
-    float* mkl_buf_scale_shift = const_cast<float*>(static_cast<const float*>(
-        mkl_tmp_grad_scale_shift_buf_tensor.flat<T>().data()));
-    float* tf_buf_scale = const_cast<float*>(
-        static_cast<const float*>(scale_backprop->flat<T>().data()));
-    float* tf_buf_shift = const_cast<float*>(
-        static_cast<const float*>(shift_backprop->flat<T>().data()));
-    auto depth = mkl_context.mkl_params.depth;
-    for (int i = 0; i < depth; i++) {
-      tf_buf_scale[i] = mkl_buf_scale_shift[i];
-      tf_buf_shift[i] = mkl_buf_scale_shift[i + depth];
-    }
-
-    // Two placeholders for estimated_mean and estimated_variance, which are
-    // used for inference and thus not needed here for gradient computation.
-    Tensor* placeholder_1 = nullptr;
-    MklShape mkl_shape_placeholder_1;
-    AllocateOutputSetMklShape(context, 3, &placeholder_1, TensorShape({}),
-                              mkl_shape_placeholder_1);
-    Tensor* placeholder_2 = nullptr;
-    MklShape mkl_shape_placeholder_2;
-    AllocateOutputSetMklShape(context, 4, &placeholder_2, TensorShape({}),
-                              mkl_shape_placeholder_2);
-
-    mkl_context.MklCleanup();
-  }
-
- private:
-  T epsilon_;
-  TensorFormat tensor_format_;
-
-  // Structure containing all info for MklOp
-  typedef struct {
-    // Parameters used for input and output layouts
-    struct MklBatchNormParams {
-      // BatchNormOp src and
-      size_t in_dims;
-      size_t in_sizes[4];
-      size_t in_strides[4];
-      size_t depth;  // Batch normalization is done for per channel.
-    } mkl_params;
-
-    MklShape mkl_shape_out_backprop;
-    MklShape mkl_shape_input_shape;
-
-    // MKL primitive and resources for BatchNormOp
-    dnnPrimitive_t mkl_prim_batchnorm_bwd = nullptr;
-    void* mkl_res_batchnorm_bwd[dnnResourceNumber];
-
-    // MKL layouts for inputs in the context
-    dnnLayout_t mkl_lt_out_backprop = nullptr;
-    dnnLayout_t mkl_lt_input = nullptr;
-
-    void MklCleanup() {
-      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
-      bool out_backprop_in_mkl_format = mkl_shape_out_backprop.IsMklTensor();
-      if (!input_in_mkl_format) dnnLayoutDelete_F32(mkl_lt_input);
-      if (!out_backprop_in_mkl_format) dnnLayoutDelete_F32(mkl_lt_out_backprop);
-
-      dnnDelete_F32(mkl_prim_batchnorm_bwd);
-    }
-
-    void MklExtractParams(OpKernelContext* context,
-                          const TensorFormat& tensor_format) {
-      const Tensor& input = MklGetInput(context, 1);
-      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
-      mkl_params.in_dims = input_in_mkl_format
-                               ? mkl_shape_input_shape.GetDimension()
-                               : input.dims();
-      mkl_params.in_sizes[0] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[0]
-                              : GetTensorDim(input, tensor_format, 'W'));
-      mkl_params.in_sizes[1] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[1]
-                              : GetTensorDim(input, tensor_format, 'H'));
-      mkl_params.in_sizes[2] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[2]
-                              : GetTensorDim(input, tensor_format, 'C'));
-      mkl_params.in_sizes[3] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[3]
-                              : GetTensorDim(input, tensor_format, 'N'));
-      mkl_params.depth = mkl_params.in_sizes[2];
-      GetStridesFromSizes(tensor_format, mkl_params.in_strides,
-                          mkl_params.in_sizes);
-    }
-
-    void MklCreateInputLayout(OpKernelContext* context) {
-      const Tensor& input = MklGetInput(context, 0);
-      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
-      if (input_in_mkl_format) {
-        mkl_lt_input =
-            static_cast<dnnLayout_t>(mkl_shape_input_shape.GetCurLayout());
-      } else {
-        CHECK_EQ(
-            dnnLayoutCreate_F32(&mkl_lt_input, mkl_params.in_dims,
-                                mkl_params.in_sizes, mkl_params.in_strides),
-            E_SUCCESS);
-      }
-
-      bool out_backprop_in_mkl_format = mkl_shape_out_backprop.IsMklTensor();
-      if (out_backprop_in_mkl_format) {
-        mkl_lt_out_backprop =
-            static_cast<dnnLayout_t>(mkl_shape_out_backprop.GetCurLayout());
-      } else {
-        CHECK_EQ(
-            dnnLayoutCreate_F32(&mkl_lt_out_backprop, mkl_params.in_dims,
-                                mkl_params.in_sizes, mkl_params.in_strides),
-            E_SUCCESS);
-      }
-    }
-
-    void MklPrepareContextInputs(OpKernelContext* context,
-                                 Tensor* mkl_tmp_input_buf_tensor,
-                                 Tensor* mkl_tmp_outbackprop_buf_tensor,
-                                 Tensor* mkl_tmp_scaleshift_buf_tensor) {
-      bool mkl_convert_input;
-      dnnPrimitive_t mkl_prim_convert_input = nullptr;
-      dnnLayout_t mkl_lt_internal_input = nullptr;
-      void* mkl_buf_converted_input = nullptr;
-      // Compare with internal layouts and convert if needed
-      const Tensor& input = MklGetInput(context, 1);
-      void* mkl_buf_input =
-          const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
-      CHECK_EQ(
-          dnnLayoutCreateFromPrimitive_F32(
-              &mkl_lt_internal_input, mkl_prim_batchnorm_bwd, dnnResourceSrc),
-          E_SUCCESS);
-      mkl_convert_input =
-          !dnnLayoutCompare_F32(mkl_lt_internal_input, mkl_lt_input);
-      if (mkl_convert_input) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, mkl_lt_input,
-                                         mkl_lt_internal_input),
-                 E_SUCCESS);
-        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
-                       &mkl_buf_converted_input);
-        CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
-                                          mkl_buf_converted_input),
-                 E_SUCCESS);
-        dnnDelete_F32(mkl_prim_convert_input);
-      }
-      dnnLayoutDelete_F32(mkl_lt_internal_input);
-      mkl_res_batchnorm_bwd[dnnResourceSrc] =
-          (mkl_convert_input) ? mkl_buf_converted_input : mkl_buf_input;
-
-      bool mkl_convert_out_backprop;
-      dnnPrimitive_t mkl_prim_convert_out_backprop = nullptr;
-      dnnLayout_t mkl_lt_internal_out_backprop = nullptr;
-      void* mkl_buf_converted_out_backprop = nullptr;
-      // Compare with internal layouts and convert if needed
-      const Tensor& out_backprop = MklGetInput(context, 0);
-      void* mkl_buf_out_backprop = const_cast<void*>(
-          static_cast<const void*>(out_backprop.flat<T>().data()));
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_out_backprop,
-                                                mkl_prim_batchnorm_bwd,
-                                                dnnResourceDiffDst),
-               E_SUCCESS);
-      mkl_convert_out_backprop = !dnnLayoutCompare_F32(
-          mkl_lt_internal_out_backprop, mkl_lt_out_backprop);
-      if (mkl_convert_out_backprop) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_out_backprop,
-                                         mkl_lt_out_backprop,
-                                         mkl_lt_internal_out_backprop),
-                 E_SUCCESS);
-        AllocTmpBuffer(context, mkl_tmp_outbackprop_buf_tensor,
-                       mkl_lt_internal_out_backprop,
-                       &mkl_buf_converted_out_backprop);
-        CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_out_backprop,
-                                          mkl_buf_out_backprop,
-                                          mkl_buf_converted_out_backprop),
-                 E_SUCCESS);
-        dnnDelete_F32(mkl_prim_convert_out_backprop);
-      }
-      dnnLayoutDelete_F32(mkl_lt_internal_out_backprop);
-      mkl_res_batchnorm_bwd[dnnResourceDiffDst] =
-          (mkl_convert_out_backprop) ? mkl_buf_converted_out_backprop
-                                     : mkl_buf_out_backprop;
-
-      // Set dnnResourceMean and dnnResourceVariance
-      const Tensor& saved_mean = MklGetInput(context, 3);
-      const Tensor& saved_var = MklGetInput(context, 4);
-      void* mkl_buf_saved_mean = const_cast<void*>(
-          static_cast<const void*>(saved_mean.flat<T>().data()));
-      void* mkl_buf_saved_var = const_cast<void*>(
-          static_cast<const void*>(saved_var.flat<T>().data()));
-      mkl_res_batchnorm_bwd[dnnResourceMean] = mkl_buf_saved_mean;
-      mkl_res_batchnorm_bwd[dnnResourceVariance] = mkl_buf_saved_var;
-
-      // Set dnnResourceScaleShift
-      // Note backward Op needs only current values of scale parameters,
-      // shift parameters could be garbage and won't be used
-      const Tensor& scale = MklGetInput(context, 2);
-      dnnLayout_t mkl_lt_scale_shift = nullptr;
-      void* mkl_buf_scale_shift = nullptr;
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_scale_shift,
-                                                mkl_prim_batchnorm_bwd,
-                                                dnnResourceScaleShift),
-               E_SUCCESS);
-      AllocTmpBuffer(context, mkl_tmp_scaleshift_buf_tensor, mkl_lt_scale_shift,
-                     &mkl_buf_scale_shift);
-      float* pscale =
-          const_cast<float*>(static_cast<const float*>(scale.flat<T>().data()));
-      float* pscale_shift = static_cast<float*>(mkl_buf_scale_shift);
-      auto depth = mkl_params.depth;
-      for (int i = 0; i < depth; i++) pscale_shift[i] = pscale[i];
-      mkl_res_batchnorm_bwd[dnnResourceScaleShift] = mkl_buf_scale_shift;
-      dnnLayoutDelete_F32(mkl_lt_scale_shift);
-    }
-
-    void MklPrepareGradScaleShift(OpKernelContext* context,
-                                  Tensor* mkl_tmp_grad_scale_shift_buf_tensor) {
-      dnnLayout_t mkl_lt_grad_scaleshift = nullptr;
-      void* mkl_buf_grad_scaleshift = nullptr;
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_grad_scaleshift,
-                                                mkl_prim_batchnorm_bwd,
-                                                dnnResourceDiffScaleShift),
-               E_SUCCESS);
-      AllocTmpBuffer(context, mkl_tmp_grad_scale_shift_buf_tensor,
-                     mkl_lt_grad_scaleshift, &mkl_buf_grad_scaleshift);
-      mkl_res_batchnorm_bwd[dnnResourceDiffScaleShift] =
-          mkl_buf_grad_scaleshift;
-      dnnLayoutDelete_F32(mkl_lt_grad_scaleshift);
-    }
-  } MklFusedBatchNormGradOpContext;
-};
-#endif
-
-#ifndef INTEL_MKL_ML_ONLY
-
 struct MklBatchNormFwdParams {
   memory::dims src_dims;
   int depth;
@@ -1765,8 +1112,6 @@ class MklFusedBatchNormGradOp : public OpKernel {
   memory::dims GetMeanVarianceDims() { return memory::dims({1, depth_}); }
 };
 
-#endif
-
 #define REGISTER_MKL_CPU(T)                                         \
   REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNorm")                \
                               .Device(DEVICE_CPU)                   \
diff --git a/tensorflow/core/kernels/mutex_ops.cc b/tensorflow/core/kernels/mutex_ops.cc
index ddb7a606c1a7f0264c7c4a9cbb2f97095d9fee01..1603a2aa869e4959713741bfb501798193a63d42 100644
--- a/tensorflow/core/kernels/mutex_ops.cc
+++ b/tensorflow/core/kernels/mutex_ops.cc
@@ -45,7 +45,9 @@ class Mutex : public ResourceBase {
     VLOG(2) << "Creating mutex with name " << name << ": " << this;
   }
 
-  string DebugString() override { return strings::StrCat("Mutex ", name_); }
+  string DebugString() const override {
+    return strings::StrCat("Mutex ", name_);
+  }
 
   class LockReleaser {
    public:
diff --git a/tensorflow/core/kernels/neon/BUILD b/tensorflow/core/kernels/neon/BUILD
index 313d40c082b3e334a01ba97eaf4449e1940b013a..6665152e3e3c7592cda8e0a09dd75d4b2409d6c4 100644
--- a/tensorflow/core/kernels/neon/BUILD
+++ b/tensorflow/core/kernels/neon/BUILD
@@ -24,7 +24,6 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core/kernels:bounds_check",
         "//tensorflow/core/kernels:ops_util",
         "@gemmlowp",
diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index fbecd909beacd88d80384a259345727981b64b6c..cadb83d8cf934dba8bbf4c3706c6e5edff381b10 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -13,33 +13,23 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/function.h"
-#include "tensorflow/core/common_runtime/optimization_registry.h"
-#include "tensorflow/core/common_runtime/placer.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/function.h"
-#include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/graph/graph_partition.h"
-#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
-#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
-#include "tensorflow/core/grappler/utils/functions.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 #include "tensorflow/core/util/ptr_util.h"
-#include "tensorflow/core/util/reffed_status_callback.h"
 
 #if GOOGLE_CUDA
 #include "tensorflow/stream_executor/stream.h"
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
-typedef FunctionLibraryRuntime::Handle FHandle;
-
 namespace {
 // A `PartitionedCallOp` asynchronously executes a function, potentially across
 // multiple devices but within a single process. The kernel places and
@@ -77,7 +67,15 @@ class PartitionedCallOp : public AsyncOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("executor_type", &executor_type_));
   }
 
-  ~PartitionedCallOp() override {}
+  ~PartitionedCallOp() override {
+    for (const auto& it : handles_) {
+      Status status = it.first->ReleaseHandle(it.second);
+      if (!status.ok()) {
+        LOG(INFO) << "Ignoring error while destructing PartitionedCallOp: "
+                  << status.ToString();
+      }
+    }
+  }
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     FunctionLibraryRuntime* lib = ctx->function_library();
@@ -85,9 +83,6 @@ class PartitionedCallOp : public AsyncOpKernel {
                       errors::Internal("No function library is provided."),
                       done);
 
-    OpInputList args;
-    OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("args", &args), done);
-
     // The function body's graph is placed and partitioned the first time
     // `ComputeAsync` is invoked; every subsequent invocation calls each
     // of the function shards yielded by partitioning.
@@ -97,526 +92,159 @@ class PartitionedCallOp : public AsyncOpKernel {
     // Inputs and outputs are pinned to the local device, for simplicity.
     //
     // TODO(akshayka): Support re-sharding the function on subsequent calls,
-    // via, e.g., virtual device annotations and a list of device names supplied
-    // through an attribute.
+    // via, e.g., virtual device annotations and a list of device names
+    // supplied through an attribute.
     //
     // TODO(akshayka): Add a fastpath for functions that execute on a single
     // device.
+    FunctionLibraryRuntime::Handle handle;
+    // If we are instantiating the function, we can efficiently extract the
+    // inputs while instantiating. Else, we extract them separately below.
+    std::vector<Tensor> inputs;
+    bool inputs_extracted = false;
     {
       mutex_lock l(mu_);
-      if (function_handles_.find(lib) == function_handles_.end()) {
-        // TODO(b/37549631): Because this kernel may correspond to a stateful
-        // op, it may be shared by multiple subgraphs, which in turn may have
-        // different `FunctionLibraryRuntime` objects and therefore different
-        // `FHandle` namespaces. As such, we partition on a per-FLR basis.
-        FunctionLibraryRuntime::InstantiateOptions opts;
-        FHandle handle;
-        OP_REQUIRES_OK_ASYNC(
-            ctx,
-            lib->Instantiate(func_.name(), AttrSlice(&func_.attr()), opts,
-                             &handle),
-            done);
-        const FunctionBody* fbody = lib->GetFunctionBody(handle);
-        OP_REQUIRES_ASYNC(ctx, fbody != nullptr,
-                          errors::Internal("Could not find handle ", handle),
-                          done);
-        OP_REQUIRES_ASYNC(
-            ctx, args.size() == fbody->arg_nodes.size(),
-            errors::InvalidArgument(
-                "Wrong number of arguments to the op; function expects ",
-                fbody->arg_nodes.size(), " but PartitionedCall received ",
-                args.size()),
-            done);
-        // We need to pass global op_registry as default_registry when creating
-        // graph. So that graph optimization passes can lookup all possible ops
-        // by name.
-        auto graph = tensorflow::MakeUnique<Graph>(fbody->graph->flib_def());
-        FunctionLibraryDefinition global_flib(OpRegistry::Global(), {});
-        TF_CHECK_OK(graph->AddFunctionLibrary(global_flib.ToProto()));
-        CopyGraph(*fbody->graph, graph.get());
-        OP_REQUIRES_OK_ASYNC(ctx, PinResourceArgs(graph.get(), args), done);
-
-        DeviceSet device_set;
-        for (auto d : lib->device_mgr()->ListDevices()) {
-          device_set.AddDevice(d);
-        }
-
-        // The FunctionLibraryRuntime's library cannot be mutated from within
-        // an OpKernel, so functions are instantiated in an overlay library.
-        OP_REQUIRES_ASYNC(
-            ctx, overlay_libs_.find(lib) == overlay_libs_.end(),
-            errors::Internal("Found an overlay library but did not "
-                             "find cached function partitions; "
-                             "this indicates a bug."),
-            done);
-        // We do not need a full function library in the overlay, we just keep a
-        // subset that is reachable from the instantiated function.
-        FunctionLibraryDefinition* overlay_lib = new FunctionLibraryDefinition(
-            grappler::ReachableFunctionLibraryDefinition(
-                *lib->GetFunctionLibraryDefinition(), fbody->fdef));
-        overlay_libs_.emplace(lib, overlay_lib);
-
-        GraphOptimizationPassOptions optimization_options;
-        // TODO(akshayka): Thread SessionOptions (if any) into this kernel, or
-        // make it possible to specify the relevant options via attributes.
-        SessionOptions session_options;
-        session_options.env = ctx->env();
-        optimization_options.session_options = &session_options;
-        optimization_options.graph = &graph;
-        optimization_options.flib_def = overlay_lib;
-        optimization_options.device_set = &device_set;
-        OP_REQUIRES_OK_ASYNC(
-            ctx,
-            OptimizationPassRegistry::Global()->RunGrouping(
-                OptimizationPassRegistry::PRE_PLACEMENT, optimization_options),
-            done);
-
-        // Make the FunctionLibraryRuntime's device the default device if
-        // nothing else is hard coded. This allows the same function definition
-        // to be specialized to different devices depending on the
-        // PartitionedCallOp's device.
-        Placer placer(graph.get(), &device_set,
-                      nullptr, /* No session options */
-                      lib->device() /* Default device */);
-        OP_REQUIRES_OK_ASYNC(ctx, placer.Run(), done);
-        OP_REQUIRES_OK_ASYNC(
-            ctx,
-            OptimizationPassRegistry::Global()->RunGrouping(
-                OptimizationPassRegistry::POST_PLACEMENT, optimization_options),
-            done);
-
-        Device* cpu_device;
-        OP_REQUIRES_OK_ASYNC(
-            ctx, lib->device_mgr()->LookupDevice("CPU:0", &cpu_device), done);
-
-        // Run grappler passes on the graph. It is possible that these are
-        // optimized by the graph executor already.
-        Status optimized = OptimizeGraph(ctx, fbody->ret_nodes, overlay_lib,
-                                         device_set, cpu_device, &graph);
-        if (!optimized.ok()) {
-          LOG(WARNING) << "Grappler optimization failed. Error: "
-                       << optimized.error_message();
-        }
-
-        OP_REQUIRES_OK_ASYNC(
-            ctx,
-            OptimizationPassRegistry::Global()->RunGrouping(
-                OptimizationPassRegistry::POST_REWRITE_FOR_EXEC,
-                optimization_options),
-            done);
-
-        std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
-        OP_REQUIRES_OK_ASYNC(
-            ctx, PartitionHelper(device_set, std::move(graph), &subgraphs),
-            done);
-        if (ctx->graph_collector() != nullptr) {
-          for (const auto& pair : subgraphs) {
-            GraphDef def;
-            pair.second->ToGraphDef(&def);
-            ctx->graph_collector()->CollectGraph(def);
-          }
-        }
-        optimization_options.graph = nullptr;
-        optimization_options.device_set = nullptr;
-        optimization_options.partition_graphs = &subgraphs;
-        OP_REQUIRES_OK_ASYNC(ctx,
-                             OptimizationPassRegistry::Global()->RunGrouping(
-                                 OptimizationPassRegistry::POST_PARTITIONING,
-                                 optimization_options),
+      auto it = handles_.find(lib);
+      if (it == handles_.end()) {
+        OP_REQUIRES_OK_ASYNC(ctx, Instantiate(lib, ctx, &inputs, &handle),
                              done);
+        inputs_extracted = true;
+        handles_[lib] = handle;
+      } else {
+        handle = it->second;
+      }
+    }
 
-        auto handles = tensorflow::MakeUnique<gtl::FlatMap<string, FHandle>>();
-        for (const auto& pair : subgraphs) {
-          // TODO(akshayka): Fail gracefully if the set of devices corresponds
-          // to more than one address space.
-          const string& target = pair.first;
-          const auto& subgraph = pair.second;
-          OP_REQUIRES_OK_ASYNC(
-              ctx, UpdateArgAndRetMetadata(target, subgraph.get()), done);
-          FunctionDef shard;
-          string unique_name = UniquifyFunctionName(overlay_lib, func_.name());
-          OP_REQUIRES_OK_ASYNC(
-              ctx, GraphToFunctionDef(*subgraph, unique_name, &shard), done);
-          OP_REQUIRES_OK_ASYNC(ctx, overlay_lib->AddFunctionDef(shard), done);
-          FunctionLibraryRuntime::InstantiateOptions opts;
-          opts.executor_type = executor_type_;
-          opts.target = target;
-          opts.overlay_lib = overlay_lib;
-          FHandle handle;
-          OP_REQUIRES_OK_ASYNC(
-              ctx,
-              lib->Instantiate(unique_name, AttrSlice(&shard.attr()), opts,
-                               &handle),
-              done);
-          handles->emplace(target, handle);
-        }
-
-        function_handles_.emplace(lib, std::move(handles));
+    if (!inputs_extracted) {
+      OpInputList args;
+      OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("args", &args), done);
+      inputs.reserve(args.size());
+      for (const Tensor& tensor : args) {
+        inputs.push_back(tensor);
       }
     }
-    ExecuteFunctions(lib, ctx, args, std::move(done));
+
+    RunFunction(handle, inputs, lib, ctx, done);
   }
 
  private:
-  typedef std::pair<string, FHandle> DeviceAndFHandle;
-  typedef std::pair<std::vector<int>, std::vector<int>> ArgAndRetIndices;
-  typedef std::pair<std::vector<AllocatorAttributes>,
-                    std::vector<AllocatorAttributes>>
-      ArgAndRetAllocAttrs;
-
-  // Pins each arg that emits a `DT_RESOURCE` tensor to the device on which the
-  // corresponding resource lives. This ensures that the Placer assigns ops that
-  // access these resources to the appropriate devices.
-  Status PinResourceArgs(Graph* graph, const OpInputList& args) {
-    for (Node* node : graph->op_nodes()) {
-      string node_type = node->type_string();
-      if (node_type == FunctionLibraryDefinition::kArgOp) {
-        const AttrValue* attr_value;
-        TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
-        int index = attr_value->i();
-        TF_RETURN_IF_ERROR(node->attrs().Find("T", &attr_value));
-        DataType dtype = attr_value->type();
-        if (dtype != args[index].dtype()) {
-          return errors::InvalidArgument("For argument ", index, " expected ",
-                                         DataTypeString(dtype), " tensor, got ",
-                                         DataTypeString(args[index].dtype()),
-                                         " instead.");
-        }
-        if (dtype == DT_RESOURCE) {
-          const ResourceHandle& handle = args[index].flat<ResourceHandle>()(0);
-          node->set_assigned_device_name(handle.device());
-        }
-      }
+  Status FillOutputDevices(const FunctionLibraryRuntime& lib,
+                           const Device& cpu_device, AttrSlice attrs,
+                           FunctionLibraryRuntime::InstantiateOptions* opts) {
+    const FunctionLibraryDefinition* flib = lib.GetFunctionLibraryDefinition();
+    const FunctionDef* fdef = flib->Find(func_.name());
+    if (fdef == nullptr) {
+      return errors::NotFound("Failed for find definiton for function \"",
+                              func_.name(), "\"");
     }
-    return Status::OK();
-  }
 
-  // Partitions `graph` and populates `subgraphs` with the partitions.
-  Status PartitionHelper(
-      const DeviceSet& device_set, std::unique_ptr<Graph> graph,
-      std::unordered_map<string, std::unique_ptr<Graph>>* subgraphs) {
-    PartitionOptions partition_options;
-    partition_options.node_to_loc = [](const Node* node) {
-      // TODO(akshayka): To better support the distributed case, first split
-      // the graph by worker (e.g,. using the master session's
-      // `SplitByWorker` policy), and then recursively partition the
-      // per-worker shards at the remote worker(s).
-      return node->assigned_device_name();
-    };
-    int64 edge_name_counter = 0;
-    partition_options.new_name = [&edge_name_counter](const string& prefix) {
-      return strings::StrCat(prefix, "/_", ++edge_name_counter);
-    };
-    partition_options.get_incarnation =
-        [&device_set](const string& name) -> int64 {
-      const Device* d = device_set.FindDeviceByName(name);
-      if (d == nullptr) {
-        return PartitionOptions::kIllegalIncarnation;
-      } else {
-        return d->attributes().incarnation();
+    bool is_type_list;
+    for (const OpDef::ArgDef& ret_def : fdef->signature().output_arg()) {
+      DataTypeVector dtypes;
+      TF_RETURN_IF_ERROR(ArgNumType(attrs, ret_def, &is_type_list, &dtypes));
+      for (DataType dtype : dtypes) {
+        if (MTypeFromDType(dtype) == HOST_MEMORY) {
+          opts->output_devices.push_back(cpu_device.name());
+        } else {
+          opts->output_devices.push_back(opts->target);
+        }
       }
-    };
-    partition_options.control_flow_added = false;
-    std::unordered_map<string, GraphDef> partitions;
-    TF_RETURN_IF_ERROR(Partition(partition_options, graph.get(), &partitions));
-
-    VLOG(3) << "Partitioned function '" << func_.name() << "', yielding "
-            << partitions.size() << " shards.";
-
-    for (const auto& partition : partitions) {
-      std::unique_ptr<Graph> subgraph(new Graph(graph->flib_def()));
-      FunctionLibraryDefinition global_flib(OpRegistry::Global(), {});
-      TF_CHECK_OK(subgraph->AddFunctionLibrary(global_flib.ToProto()));
-      GraphConstructorOptions opts;
-      opts.allow_internal_ops = true;
-      opts.expect_device_spec = true;
-      const string& device = partition.first;
-      const GraphDef& graph_def = partition.second;
-      TF_RETURN_IF_ERROR(
-          ConvertGraphDefToGraph(opts, graph_def, subgraph.get()));
-      subgraphs->emplace(device, std::move(subgraph));
     }
-
     return Status::OK();
   }
 
-  // Each subgraph produced by partitioning the function body contains a subset
-  // of the original `Arg` and `Retval` nodes. This function performs
-  // bookkeeping to track which `Arg` and `Retval` nodes were placed on a
-  // particular device / subgraph.
-  //
-  // More specifically, this function
-  //  (1) rewrites the indices of the `Arg` and `Retval` nodes placed on a
-  //      particular device,
-  //  (2) records the subsets of `Arg` and `Retval` nodes assigned to the
-  //      device, and
-  //  (3) records which `Arg` and `Retval` nodes live in host memory.
-  Status UpdateArgAndRetMetadata(const string& device, Graph* subgraph) {
-    ArgAndRetIndices indices;
-    std::vector<int>* arg_indices = &indices.first;
-    std::vector<int>* ret_indices = &indices.second;
-    std::vector<std::pair<Node*, int>> arg_nodes;
-    std::vector<std::pair<Node*, int>> ret_nodes;
-    const AttrValue* attr_value;
+  Status Instantiate(FunctionLibraryRuntime* lib, OpKernelContext* ctx,
+                     std::vector<Tensor>* inputs,
+                     FunctionLibraryRuntime::Handle* handle) {
+    FunctionLibraryRuntime::InstantiateOptions opts;
+    opts.target = lib->device()->name();
+    opts.is_multi_device_function = true;
+    opts.optimize_graph_fn =
+        std::bind(grappler::OptimizeGraph, std::placeholders::_1,
+                  std::placeholders::_2, std::placeholders::_3,
+                  std::placeholders::_4, config_proto_, std::placeholders::_5);
+    opts.graph_collector = ctx->graph_collector();
+    opts.executor_type = executor_type_;
 
-    // Find the Arg and Retval nodes, along with their corresponding indices
-    // in the original function.
-    for (Node* node : subgraph->op_nodes()) {
-      string node_type = node->type_string();
-      if (node_type == FunctionLibraryDefinition::kArgOp) {
-        TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
-        int index = attr_value->i();
-        arg_indices->push_back(index);
-        arg_nodes.push_back(std::make_pair(node, index));
-      } else if (node_type == FunctionLibraryDefinition::kRetOp) {
-        TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
-        int index = attr_value->i();
-        ret_indices->push_back(index);
-        ret_nodes.push_back(std::make_pair(node, index));
+    OpInputList args;
+    TF_RETURN_IF_ERROR(ctx->input_list("args", &args));
+    Device* cpu_device;
+    TF_RETURN_IF_ERROR(lib->device_mgr()->LookupDevice("CPU:0", &cpu_device));
+
+    inputs->reserve(args.size());
+    for (const Tensor& tensor : args) {
+      inputs->push_back(tensor);
+      DataType dtype = tensor.dtype();
+      if (dtype == DT_RESOURCE) {
+        const ResourceHandle& handle = tensor.flat<ResourceHandle>()(0);
+        opts.input_devices.push_back(handle.device());
+      } else if (MTypeFromDType(dtype) == HOST_MEMORY) {
+        opts.input_devices.push_back(cpu_device->name());
+      } else {
+        opts.input_devices.push_back(opts.target);
       }
     }
 
-    for (int i = 0; i < arg_nodes.size(); ++i) {
-      Node* arg = arg_nodes[i].first;
-      arg->AddAttr("index", i);
-      TF_RETURN_IF_ERROR(arg->attrs().Find("T", &attr_value));
-      AllocatorAttributes alloc_attr;
-      DataType type = attr_value->type();
-      if (MTypeFromDType(type) == HOST_MEMORY) {
-        alloc_attr.set_on_host(true);
-      }
-      arg_and_ret_alloc_attrs_[device].first.push_back(alloc_attr);
-    }
-    for (int i = 0; i < ret_nodes.size(); ++i) {
-      Node* ret = ret_nodes[i].first;
-      ret->AddAttr("index", i);
-      TF_RETURN_IF_ERROR(ret->attrs().Find("T", &attr_value));
-      AllocatorAttributes alloc_attr;
-      DataType type = attr_value->type();
-      if (MTypeFromDType(type) == HOST_MEMORY) {
-        alloc_attr.set_on_host(true);
-      }
-      arg_and_ret_alloc_attrs_[device].second.push_back(alloc_attr);
-    }
+    TF_RETURN_IF_ERROR(
+        FillOutputDevices(*lib, *cpu_device, AttrSlice(&func_.attr()), &opts));
 
-    // If this kernel execution corresponds to a StatefulPartitionedCallOp,
-    // `arg_and_ret_indices_` might have been populated by a previous
-    // invocation.
-    if (arg_and_ret_indices_.find(device) == arg_and_ret_indices_.end()) {
-      arg_and_ret_indices_.emplace(device, indices);
-    }
+    TF_RETURN_IF_ERROR(
+        lib->Instantiate(func_.name(), AttrSlice(&func_.attr()), opts, handle));
     return Status::OK();
   }
 
-  std::vector<Tensor> GetArgsForIndices(const std::vector<int>& indices,
-                                        const OpInputList& arguments) {
-    std::vector<Tensor> args;
-    args.reserve(indices.size());
-    for (int i : indices) {
-      args.push_back(arguments[i]);
-    }
-    return args;
-  }
-
-  void ExecuteFunctions(FunctionLibraryRuntime* lib, OpKernelContext* ctx,
-                        const OpInputList& op_args, DoneCallback done)
-      LOCKS_EXCLUDED(mu_) {
-    const gtl::FlatMap<string, FHandle>* handles;
-    {
-      mutex_lock l(mu_);
-      handles = function_handles_[lib].get();
-    }
-    if (handles->empty()) {
-      // Trivial case where the function body is empty.
-      ctx->SetStatus(Status::OK());
-      done();
-      return;
-    }
-
-    const string& local_device_name = lib->device()->name();
-    FunctionLibraryRuntime::Options opts;
-    opts.step_id = ctx->step_id();
-    opts.step_container = ctx->step_container();
-    opts.cancellation_manager = ctx->cancellation_manager();
-    opts.stats_collector = ctx->stats_collector();
-    // TODO(akshayka): Consider selecting a runner on a per-device basis, i.e.,
-    // using device-specific threadpools when available.
-    opts.runner = ctx->runner();
-    opts.source_device = local_device_name;
-    opts.allow_dead_tensors = true;
+  void RunFunction(FunctionLibraryRuntime::Handle handle,
+                   const std::vector<Tensor>& inputs,
+                   FunctionLibraryRuntime* lib, OpKernelContext* ctx,
+                   DoneCallback done) {
+    FunctionLibraryRuntime::Options run_opts;
+    run_opts.step_id = ctx->step_id();
+    run_opts.step_container = ctx->step_container();
+    run_opts.cancellation_manager = ctx->cancellation_manager();
+    run_opts.stats_collector = ctx->stats_collector();
+    run_opts.collective_executor = ctx->collective_executor();
+    // TODO(akshayka): Consider selecting a runner on a per-device basis,
+    // i.e., using device-specific threadpools when available.
+    run_opts.runner = ctx->runner();
+    run_opts.source_device = lib->device()->name();
+    run_opts.allow_dead_tensors = true;
     // TODO(akshayka): Accommodate the multiple-worker scenario by adding the
     // constructed rendezvous to a rendezvous manager.
     Rendezvous* rendez = new IntraProcessRendezvous(lib->device_mgr());
-    opts.rendezvous = rendez;
-
-    StatusCallback callback = std::bind(
-        [](Rendezvous* rendez, DoneCallback& done, const Status& status) {
-          rendez->Unref();
-          done();
-        },
-        rendez, std::move(done), std::placeholders::_1);
-    auto* refcounted_done = new ReffedStatusCallback(std::move(callback));
-    for (int i = 0; i < handles->size(); ++i) {
-      refcounted_done->Ref();
-    }
-
-    for (const auto& pair : *handles) {
-      const string& target = pair.first;
-      FHandle handle = pair.second;
-      VLOG(3) << "Running function shard on device " << target;
-      ArgAndRetIndices indices = arg_and_ret_indices_[target];
-      ArgAndRetAllocAttrs alloc_attrs = arg_and_ret_alloc_attrs_[target];
-      const std::vector<int>& arg_indices = indices.first;
-      const std::vector<int>& ret_indices = indices.second;
-      opts.args_alloc_attrs = alloc_attrs.first;
-      opts.rets_alloc_attrs = alloc_attrs.second;
-      if (target == local_device_name) {
-        opts.remote_execution = false;
-        std::vector<Tensor> args = GetArgsForIndices(arg_indices, op_args);
-        std::vector<Tensor>* rets = new std::vector<Tensor>;
-        lib->Run(
-            opts, handle, args, rets,
-            [rets, ret_indices, refcounted_done, ctx](const Status& status) {
-              if (!status.ok()) {
-                VLOG(3) << "Local execution failed: " << status;
-                ctx->SetStatus(status);
-              } else {
-                for (int i = 0; i < rets->size(); ++i) {
-                  ctx->set_output(ret_indices[i], (*rets)[i]);
-                }
-              }
-              delete rets;
-              VLOG(3) << "Finished local execution.";
-              refcounted_done->Unref();
-            });
-      } else {
-        opts.remote_execution = true;
-        std::vector<Tensor> args = GetArgsForIndices(arg_indices, op_args);
-        std::vector<Tensor>* rets = new std::vector<Tensor>;
-        lib->Run(
-            opts, handle, args, rets,
-            [rets, ret_indices, refcounted_done, ctx](const Status& status) {
-              if (!status.ok()) {
-                VLOG(3) << "Remote execution failed: " << status;
-                ctx->SetStatus(status);
-              } else {
-                for (int i = 0; i < rets->size(); ++i) {
-                  ctx->set_output(ret_indices[i], (*rets)[i]);
-                }
-              }
-              delete rets;
-              VLOG(3) << "Finished remote execution.";
-              refcounted_done->Unref();
-            });
-      }
-    }
-    refcounted_done->Unref();
-  }
-
-  string UniquifyFunctionName(const FunctionLibraryDefinition* function_library,
-                              const string& name) {
-    for (;; ++suffix_) {
-      const string candidate = strings::StrCat(name, "_", suffix_);
-      if (function_library->Find(candidate) == nullptr) {
-        return candidate;
-      }
-    }
-  }
-
-  Status OptimizeGraph(OpKernelContext* ctx,
-                       const gtl::InlinedVector<Node*, 4>& ret_nodes,
-                       FunctionLibraryDefinition* flib,
-                       const DeviceSet& device_set, Device* cpu_device,
-                       std::unique_ptr<Graph>* graph) {
-    if (!tensorflow::grappler::MetaOptimizerEnabled(config_proto_)) {
-      return Status::OK();
-    }
-
-    tensorflow::grappler::GrapplerItem item;
-
-    // Add all available devices so that inlined function can be placed.
-    for (const Device* d : device_set.devices()) {
-      Status added_device = item.AddDevice(d->name());
-      if (!added_device.ok()) VLOG(3) << added_device.error_message();
-    }
-
-    // Add fetches so that the graph can be pruned.
-    for (Node* node : ret_nodes) {
-      item.fetch.push_back(node->name());
-    }
-
-    (*graph)->ToGraphDef(&item.graph);
-
-    if (flib) {
-      *item.graph.mutable_library() = flib->ToProto();
-    }
-
-    tensorflow::GraphDef out_graph;
-
-    tensorflow::grappler::VirtualCluster cluster(&device_set);
-
-    // TODO(nareshmodi): Consider adding and using the more generic GraphOptions
-    // proto (which also contain the OptimizerOptions).
-    TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
-        item, config_proto_, cpu_device, &cluster, &out_graph));
-
-    std::unique_ptr<Graph> optimized_graph(new Graph(OpRegistry::Global()));
-    TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(
-        GraphConstructorOptions(), out_graph, optimized_graph.get()));
-
-    // Copy optimized functions back to the overlay lib.
-    if (flib) {
-      for (const FunctionDef& fdef : out_graph.library().function()) {
-        const string& func_name = fdef.signature().name();
-        if (flib->Contains(func_name)) {
-          TF_RETURN_IF_ERROR(flib->ReplaceFunction(func_name, fdef));
-        } else {
-          TF_RETURN_IF_ERROR(flib->AddFunctionDef(fdef));
-        }
-      }
-    }
-
-    *graph = std::move(optimized_graph);
-
-    // The graph conversion sets the requested device names but not the
-    // assigned device names. However, since at this point the graph is
-    // placed TF expects an assigned device name for every node. Therefore
-    // we copy the requested device into the assigned device field.
-    for (Node* node : graph->get()->nodes()) {
-      node->set_assigned_device_name(node->requested_device());
-    }
-
-    return Status::OK();
+    run_opts.rendezvous = rendez;
+
+    std::vector<Tensor>* rets = new std::vector<Tensor>;
+    const string& func_name = func_.name();
+    lib->Run(run_opts, handle, inputs, rets,
+             [rets, rendez, done, ctx, func_name](const Status& status) {
+               if (!status.ok()) {
+                 const string function_and_msg =
+                     strings::StrCat(errors::FormatFunctionForError(func_name),
+                                     " ", status.error_message());
+                 ctx->SetStatus(Status(status.code(), function_and_msg));
+               } else {
+                 for (int i = 0; i < rets->size(); ++i) {
+                   ctx->set_output(i, (*rets)[i]);
+                 }
+               }
+               delete rets;
+               rendez->Unref();
+               done();
+             });
   }
 
   NameAttrList func_;
   ConfigProto config_proto_;
   string executor_type_;
-  // Contains maps from device names to handles of function partitions, keyed by
-  // FunctionLibraryRuntime pointers. (Because this kernel may be instantiated
-  // for a stateful op, different invocations of it may use different
-  // FLRs. Different device placements of PartitionedCallOp also use different
-  // FLRs, and we use this to set the "default" device for the function to
-  // PartitionedCallOp's device.)
-  gtl::FlatMap<FunctionLibraryRuntime*,
-               std::unique_ptr<gtl::FlatMap<string, FHandle>>>
-      function_handles_ GUARDED_BY(mu_);
-  // Function partitions are added to overlay libraries.
-  gtl::FlatMap<FunctionLibraryRuntime*,
-               std::unique_ptr<FunctionLibraryDefinition>>
-      overlay_libs_ GUARDED_BY(mu_);
-  // Map from device name to the indices of the arguments and return values
-  // placed on that device. Read-only after the first invocation.
-  gtl::FlatMap<string, ArgAndRetIndices> arg_and_ret_indices_;
-  // Map from device name to alloc attrs for arguments and return values of the
-  // function placed on that device. Read-only after the first invocation.
-  gtl::FlatMap<string, ArgAndRetAllocAttrs> arg_and_ret_alloc_attrs_;
-
   mutex mu_;
-
-  // Used to uniquify function names in `overlay_libs_`.
-  uint32 suffix_ = 0;
+  // Cache the handle per FLR because this kernel may be instantiated for
+  // a stateful op, different invocations of it may use different FLRs.
+  // Different device placements of PartitionedCallOp also use
+  // different FLRs.
+  gtl::FlatMap<FunctionLibraryRuntime*, FunctionLibraryRuntime::Handle> handles_
+      GUARDED_BY(mu_);
 };
+
 REGISTER_KERNEL_BUILDER(Name("PartitionedCall").Device(DEVICE_CPU),
                         PartitionedCallOp);
 REGISTER_KERNEL_BUILDER(Name("StatefulPartitionedCall").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index e583f7feb4df9605115cd16aec54d1f3e9bb8b9c..69122f467c8fcf3818ab69f3f96d00b9a6b3c245 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 
 #if GOOGLE_CUDA
+#include "cuda/include/cudnn.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
@@ -28,6 +29,20 @@ limitations under the License.
 
 namespace tensorflow {
 
+namespace {
+
+template <typename T>
+struct RawType {
+  using type = T;
+};
+
+template <>
+struct RawType<qint8> {
+  using type = int8;
+};
+
+}  // namespace
+
 PoolParameters::PoolParameters(OpKernelContext* context,
                                const std::vector<int32>& ksize,
                                const std::vector<int32>& stride,
@@ -156,7 +171,10 @@ void DnnPoolingOp<T>::Compute(OpKernelContext* context,
     return;
   }
 
-  /// For now, cudnn does not support NHWC format, so we need to convert it
+  int batch_size = params.tensor_in_batch;
+  int depth = params.depth;
+#if CUDNN_VERSION < 7300
+  /// Earlier versions do not support NHWC format, so we need to convert it
   /// to NCHW before calling cudnn. We need to get rid of this once it is done
   Tensor transformed_input;
   if (data_format == FORMAT_NHWC) {
@@ -181,7 +199,31 @@ void DnnPoolingOp<T>::Compute(OpKernelContext* context,
   } else {
     transformed_output = *tensor_out;
   }
-
+  se::dnn::DataLayout data_layout = se::dnn::DataLayout::kBatchDepthYX;
+#else
+  auto& transformed_input = tensor_in;
+  auto& transformed_output = *tensor_out;
+  se::dnn::DataLayout data_layout;
+  switch (data_format) {
+    case FORMAT_NHWC:
+      data_layout = se::dnn::DataLayout::kBatchYXDepth;
+      break;
+    case FORMAT_NCHW:
+      data_layout = se::dnn::DataLayout::kBatchDepthYX;
+      break;
+    case FORMAT_NCHW_VECT_C:
+      // NCHW_VECT_C is not supported by cudnnPoolingForward(), but can be
+      // emulated via NHWC.
+      data_layout = se::dnn::DataLayout::kBatchYXDepth;
+      batch_size *= depth;
+      depth = 4;
+      break;
+    default:
+      OP_REQUIRES(context, false,
+                  errors::InvalidArgument("Unsupported format: ",
+                                          ToString(data_format)));
+  }
+#endif
   /// Get ready to call cudnn
   se::dnn::PoolingDescriptor pooling_desc;
   pooling_desc.set_pooling_mode(pooling_mode)
@@ -194,23 +236,27 @@ void DnnPoolingOp<T>::Compute(OpKernelContext* context,
       .set_propagate_nans(propagate_nans);
 
   se::dnn::BatchDescriptor input_desc;
-  input_desc.set_count(params.tensor_in_batch)
+  input_desc.set_count(batch_size)
       .set_height(params.tensor_in_rows)
       .set_width(params.tensor_in_cols)
-      .set_feature_map_count(params.depth)
-      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+      .set_feature_map_count(depth)
+      .set_layout(data_layout);
 
   se::dnn::BatchDescriptor output_desc;
-  output_desc.set_count(params.tensor_in_batch)
+  output_desc.set_count(batch_size)
       .set_height(params.out_height)
       .set_width(params.out_width)
-      .set_feature_map_count(params.depth)
-      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+      .set_feature_map_count(depth)
+      .set_layout(data_layout);
+
+  auto input_data =
+      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
+                         transformed_input.template flat<T>().data()),
+                     transformed_input.template flat<T>().size());
 
-  auto input_data = AsDeviceMemory(transformed_input.template flat<T>().data(),
-                                   transformed_input.template flat<T>().size());
   auto output_data =
-      AsDeviceMemory(transformed_output.template flat<T>().data(),
+      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
+                         transformed_output.template flat<T>().data()),
                      transformed_output.template flat<T>().size());
 
   auto* stream = context->op_device_context()->stream();
@@ -222,15 +268,17 @@ void DnnPoolingOp<T>::Compute(OpKernelContext* context,
                     .ok();
   OP_REQUIRES(context, status,
               errors::Internal("cudnn PoolForward launch failed"));
-
+#if CUDNN_VERSION < 7300
   if (data_format == FORMAT_NHWC) {
     /// Transform the output data from NCHW back to NHWC
     auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
-    functor::NCHWToNHWC<GPUDevice, T, 4>()(
+    using RT = typename RawType<T>::type;
+    functor::NCHWToNHWC<GPUDevice, RT, 4>()(
         context->eigen_device<Device>(),
-        toConstTensor(transformed_output).template tensor<T, 4>(),
-        tensor_out->tensor<T, 4>());
+        toConstTensor(transformed_output).template tensor<RT, 4>(),
+        tensor_out->tensor<RT, 4>());
   }
+#endif
 }
 
 template <typename T>
@@ -388,6 +436,11 @@ void DnnPoolingGradOp<T>::Compute(
   template class DnnPoolingOp<T>; \
   template class DnnPoolingGradOp<T>;
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_DNN_OPS)
+
+#if CUDNN_VERSION >= 7300
+template class DnnPoolingOp<qint8>;
+#endif
+
 #undef DEFINE_DNN_OPS
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/priority_queue.h b/tensorflow/core/kernels/priority_queue.h
index 8e69b5b699065a8722f4e19acaf8b57a7e0b64ed..a719c518c3e9206020602e315d0b0e3be474bfd0 100644
--- a/tensorflow/core/kernels/priority_queue.h
+++ b/tensorflow/core/kernels/priority_queue.h
@@ -68,7 +68,7 @@ class PriorityQueue
   Status MatchesPriorityNodeDefTypes(const NodeDef& node_def) const;
   Status MatchesPriorityNodeDefShapes(const NodeDef& node_def) const;
 
-  int32 size() override {
+  int32 size() const override {
     mutex_lock lock(mu_);
     return queues_[0].size();
   }
diff --git a/tensorflow/core/kernels/random_shuffle_queue_op.cc b/tensorflow/core/kernels/random_shuffle_queue_op.cc
index 31e8ce944fef913fd241801f4931fcb4dfd2025c..02b9b022fdcb00b3d9f4f676be579abced5e720e 100644
--- a/tensorflow/core/kernels/random_shuffle_queue_op.cc
+++ b/tensorflow/core/kernels/random_shuffle_queue_op.cc
@@ -59,7 +59,7 @@ class RandomShuffleQueue : public TypedQueue<std::vector<PersistentTensor> > {
                       CallbackWithTuple callback) override;
   Status MatchesNodeDef(const NodeDef& node_def) override;
 
-  int32 size() override {
+  int32 size() const override {
     mutex_lock lock(mu_);
     return queues_[0].size();
   }
diff --git a/tensorflow/core/kernels/scan_ops_gpu.cu.cc b/tensorflow/core/kernels/scan_ops_gpu.h
similarity index 97%
rename from tensorflow/core/kernels/scan_ops_gpu.cu.cc
rename to tensorflow/core/kernels/scan_ops_gpu.h
index ed66c02dc584541ce4d5eb644630b678c1b05916..976b2215405105ece0a5d25c2684aa558b01d8a0 100644
--- a/tensorflow/core/kernels/scan_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/scan_ops_gpu.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_SCAN_OPS_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_SCAN_OPS_GPU_H_
+
 #if GOOGLE_CUDA
 
 #define EIGEN_USE_GPU
@@ -290,17 +293,8 @@ struct Scan<GPUDevice, Eigen::internal::ProdReducer<T>, T> {
 };
 
 }  // namespace functor
-
-#define DEFINE(REDUCER, T) template struct functor::Scan<GPUDevice, REDUCER, T>;
-
-#define DEFINE_FOR_ALL_REDUCERS(T)           \
-  DEFINE(Eigen::internal::SumReducer<T>, T); \
-  DEFINE(Eigen::internal::ProdReducer<T>, T);
-
-TF_CALL_GPU_NUMBER_TYPES(DEFINE_FOR_ALL_REDUCERS);
-#undef DEFINE_FOR_ALL_REDUCERS
-#undef DEFINE
-
 }  // end namespace tensorflow
 
 #endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_KERNELS_SCAN_OPS_GPU_H_
diff --git a/tensorflow/core/kernels/scan_ops_gpu_double.cu.cc b/tensorflow/core/kernels/scan_ops_gpu_double.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..adce37e473c4f3f31b29db5b71c4d004da1b6b29
--- /dev/null
+++ b/tensorflow/core/kernels/scan_ops_gpu_double.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/scan_ops.h"
+#include "tensorflow/core/kernels/scan_ops_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+template struct functor::Scan<GpuDevice, Eigen::internal::SumReducer<double>,
+                              double>;
+template struct functor::Scan<GpuDevice, Eigen::internal::ProdReducer<double>,
+                              double>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/scan_ops_gpu_float.cu.cc b/tensorflow/core/kernels/scan_ops_gpu_float.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b72415822d0eebecf8426008266c5bd503b8830c
--- /dev/null
+++ b/tensorflow/core/kernels/scan_ops_gpu_float.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/scan_ops.h"
+#include "tensorflow/core/kernels/scan_ops_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+template struct functor::Scan<GpuDevice, Eigen::internal::SumReducer<float>,
+                              float>;
+template struct functor::Scan<GpuDevice, Eigen::internal::ProdReducer<float>,
+                              float>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/scan_ops_gpu_half.cu.cc b/tensorflow/core/kernels/scan_ops_gpu_half.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f9fb528be98efc722df3f8b76adc65ae7fa29cdb
--- /dev/null
+++ b/tensorflow/core/kernels/scan_ops_gpu_half.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/scan_ops.h"
+#include "tensorflow/core/kernels/scan_ops_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+template struct functor::Scan<
+    GpuDevice, Eigen::internal::SumReducer<Eigen::half>, Eigen::half>;
+template struct functor::Scan<
+    GpuDevice, Eigen::internal::ProdReducer<Eigen::half>, Eigen::half>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index 5a14ab633048dd68bc35e4bd207eea034a57d901..1b1c59cf34d47d1b157d6665ff9f6c11968cabb5 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -351,7 +351,9 @@ class ScatterNdUpdateOp : public OpKernel {
   REGISTER_SCATTER_ND_UPDATE_KERNEL(type, dev, "ScatterNdSub",            \
                                     scatter_nd_op::UpdateOp::SUB);        \
   REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL(                             \
-      type, dev, "ResourceScatterNdAdd", scatter_nd_op::UpdateOp::ADD);
+      type, dev, "ResourceScatterNdAdd", scatter_nd_op::UpdateOp::ADD);   \
+  REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL(                             \
+      type, dev, "ResourceScatterNdSub", scatter_nd_op::UpdateOp::SUB);
 
 #define REGISTER_SCATTER_ND(type, dev) \
   REGISTER_SCATTER_ND_KERNEL(type, dev, "ScatterNd");
diff --git a/tensorflow/core/kernels/sdca_internal.cc b/tensorflow/core/kernels/sdca_internal.cc
index a8e9b3261cd29191955509f34028660dff862bd7..2bb2c0d91e94b9462af330e806745cfb8317767a 100644
--- a/tensorflow/core/kernels/sdca_internal.cc
+++ b/tensorflow/core/kernels/sdca_internal.cc
@@ -26,6 +26,10 @@ limitations under the License.
 #include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 namespace tensorflow {
 namespace sdca {
 
diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc
index 9db0bd4d98bdb9964cb561d96d91782ba3615a7f..21c3b89f548e30cff345a072ca2e11dfe15081b5 100644
--- a/tensorflow/core/kernels/sequence_ops.cc
+++ b/tensorflow/core/kernels/sequence_ops.cc
@@ -143,11 +143,12 @@ class LinSpaceOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, TensorShape({num}), &out));
     auto flat = out->flat<T>();
-    if (num == 1) {
-      flat(0) = start;
-    } else {
+    flat(0) = start;
+    if (num > 1) {
       const T step = (stop - start) / (num - 1);
-      for (Tnum i = 0; i < num; ++i) flat(i) = start + step * i;
+      for (Tnum i = 1; i < num - 1; ++i) flat(i) = start + step * i;
+      // Ensure final value == stop; float arithmetic won't guarantee this.
+      flat(num - 1) = stop;
     }
   }
 };
diff --git a/tensorflow/core/kernels/sequence_ops_test.cc b/tensorflow/core/kernels/sequence_ops_test.cc
index 5f0e0a69a890aafa56b43cc55e99f490c100faa7..2247c447500693942ebaeda33eb5cd2baf7d226a 100644
--- a/tensorflow/core/kernels/sequence_ops_test.cc
+++ b/tensorflow/core/kernels/sequence_ops_test.cc
@@ -114,6 +114,27 @@ TEST_F(LinSpaceOpTest, Simple_D32) {
   test::ExpectTensorEqual<float>(expected, *GetOutput(0));
 }
 
+TEST_F(LinSpaceOpTest, Exact_Endpoints) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run. The particular values 0., 1., and 42 are chosen to test that
+  // the last value is not calculated via an intermediate delta as (1./41)*41,
+  // because for IEEE 32-bit floats that returns 0.99999994 != 1.0.
+  AddInputFromArray<float>(TensorShape({}), {0.0});
+  AddInputFromArray<float>(TensorShape({}), {1.0});
+  AddInputFromArray<int32>(TensorShape({}), {42});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output
+  Tensor output = *GetOutput(0);
+  float expected_start = 0.0;
+  float start = output.flat<float>()(0);
+  EXPECT_EQ(expected_start, start) << expected_start << " vs. " << start;
+  float expected_stop = 1.0;
+  float stop = output.flat<float>()(output.NumElements() - 1);
+  EXPECT_EQ(expected_stop, stop) << expected_stop << " vs. " << stop;
+}
+
 TEST_F(LinSpaceOpTest, Single_D64) {
   MakeOp(DT_FLOAT, DT_INT64);
 
diff --git a/tensorflow/core/kernels/softmax_op_functor.h b/tensorflow/core/kernels/softmax_op_functor.h
index c8bc1ad3bbb60e147dbb1d8fdf3c988b395ea19d..218698f3fff89166c0440195de25295dfe0028ab 100644
--- a/tensorflow/core/kernels/softmax_op_functor.h
+++ b/tensorflow/core/kernels/softmax_op_functor.h
@@ -57,7 +57,6 @@ struct SoftmaxEigenImpl {
     Eigen::DSizes<int, 2> one_by_class(1, num_classes);
 #else
     Eigen::IndexList<Eigen::type2index<kClassDim> > along_class;
-    Eigen::IndexList<Eigen::type2index<1> > depth_dim;
     Eigen::IndexList<int, Eigen::type2index<1> > batch_by_one;
     batch_by_one.set(0, batch_size);
     Eigen::IndexList<Eigen::type2index<1>, int> one_by_class;
diff --git a/tensorflow/core/kernels/sparse_tensors_map_ops.cc b/tensorflow/core/kernels/sparse_tensors_map_ops.cc
index 74fa3a15f06fdb267dc9776ee8a0903f8f6626de..939638b37058bf8294ebc437c6c14dbb696a8aa8 100644
--- a/tensorflow/core/kernels/sparse_tensors_map_ops.cc
+++ b/tensorflow/core/kernels/sparse_tensors_map_ops.cc
@@ -43,7 +43,7 @@ class SparseTensorsMap : public ResourceBase {
  public:
   explicit SparseTensorsMap(const string& name) : name_(name), counter_(0) {}
 
-  string DebugString() override { return "A SparseTensorsMap"; }
+  string DebugString() const override { return "A SparseTensorsMap"; }
 
   typedef struct {
     PersistentTensor indices;
diff --git a/tensorflow/core/kernels/sparse_xent_op.cc b/tensorflow/core/kernels/sparse_xent_op.cc
index f84ffd53238f7753c1b4562268be9058c6c03e6d..37d4d0661cadc1d86af10c8226e4aae52b4b8c7c 100644
--- a/tensorflow/core/kernels/sparse_xent_op.cc
+++ b/tensorflow/core/kernels/sparse_xent_op.cc
@@ -90,9 +90,8 @@ class SparseSoftmaxXentWithLogitsOp : public OpKernel {
             context, CheckInvalidLabelIndex<Index>(labels, logits.dim_size(1)));
       }
       functor::SparseXentFunctor<Device, T, Index> functor;
-      functor(context->eigen_device<Device>(), logits.matrix<T>(),
-              labels.vec<Index>(), scratch.vec<T>(), loss_out->vec<T>(),
-              back_out->matrix<T>());
+      functor(context, logits.matrix<T>(), labels.vec<Index>(),
+              scratch.vec<T>(), loss_out->vec<T>(), back_out->matrix<T>());
     }
   }
 };
@@ -102,11 +101,11 @@ class SparseSoftmaxXentWithLogitsOp : public OpKernel {
 namespace functor {
 template <typename T, typename Index>
 struct SparseXentFunctor<CPUDevice, T, Index> {
-  void operator()(const CPUDevice& d, typename TTypes<T>::ConstMatrix logits,
+  void operator()(OpKernelContext* ctx, typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<Index>::ConstVec labels,
                   typename TTypes<T>::Vec scratch, typename TTypes<T>::Vec loss,
                   typename TTypes<T>::Matrix backprop) {
-    SparseXentEigenImpl<CPUDevice, T, Index>::Compute(d, logits, labels,
+    SparseXentEigenImpl<CPUDevice, T, Index>::Compute(ctx, logits, labels,
                                                       scratch, loss, backprop);
   }
 };
diff --git a/tensorflow/core/kernels/sparse_xent_op.h b/tensorflow/core/kernels/sparse_xent_op.h
index 6ba7931ab5f923cec2efa44fb44e2b3a91f73ebe..5e462424ed8a54de417933b0ecc0b08e0bbe1f02 100644
--- a/tensorflow/core/kernels/sparse_xent_op.h
+++ b/tensorflow/core/kernels/sparse_xent_op.h
@@ -18,6 +18,7 @@ limitations under the License.
 // Functor definition for SparseXentOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/macros.h"
@@ -128,6 +129,26 @@ class SparseXentGradGenerator {
 
 namespace functor {
 
+template <typename Device, typename T>
+struct RowMaxReduction {
+  // Computes the maximum across the rows of logits
+  //
+  // logits: batch_size, num_classes.
+  // maximum: temporary tensor, dims: batch_size, 1
+  static inline void Compute(OpKernelContext* ctx,
+                             typename TTypes<T>::ConstMatrix logits,
+                             typename TTypes<T>::Vec maximum) {
+#if !defined(EIGEN_HAS_INDEX_LIST)
+    Eigen::array<int, 1> along_row;
+    along_row[0] = 1;
+#else
+    Eigen::IndexList<Eigen::type2index<1> > along_row;
+#endif
+    Device d = ctx->eigen_device<Device>();
+    To32Bit(maximum).device(d) = To32Bit(logits).maximum(along_row);
+  }
+};
+
 // Functor used by SparseXentOp to do the computations.
 template <typename Device, typename T, typename Index>
 struct SparseXentFunctor {
@@ -138,7 +159,7 @@ struct SparseXentFunctor {
   // scratch: temporary tensor, dims: batch_size, 1
   // loss: output tensor for the loss, dims: batch_size.
   // backprop: output tensor for the backprop, dims: batch_size, num_classes.
-  void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
+  void operator()(OpKernelContext* ctx, typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<Index>::ConstVec labels,
                   typename TTypes<T>::Vec scratch, typename TTypes<T>::Vec loss,
                   typename TTypes<T>::Matrix backprop);
@@ -149,7 +170,8 @@ struct SparseXentFunctor {
 // specializations for both device types.
 template <typename Device, typename T, typename Index>
 struct SparseXentEigenImpl {
-  static void Compute(const Device& d, typename TTypes<T>::ConstMatrix logits,
+  static void Compute(OpKernelContext* ctx,
+                      typename TTypes<T>::ConstMatrix logits,
                       typename TTypes<Index>::ConstVec labels,
                       typename TTypes<T>::Vec scratch,
                       typename TTypes<T>::Vec loss,
@@ -188,8 +210,9 @@ struct SparseXentEigenImpl {
 #endif
 
     // scratch = max_logits along classes.
-    To32Bit(scratch).device(d) = To32Bit(logits).maximum(along_class);
+    RowMaxReduction<Device, T>::Compute(ctx, logits, scratch);
 
+    Device d = ctx->eigen_device<Device>();
     // backprop = logits - max_logits.
     To32Bit(backprop).device(d) =
         To32Bit(logits) -
diff --git a/tensorflow/core/kernels/sparse_xent_op_gpu.cu.cc b/tensorflow/core/kernels/sparse_xent_op_gpu.cu.cc
index d0539660282240bd40495a5078771d1f7a1f3211..5fe15352c3e562eff0fee5dd43fb8625f4c27fa5 100644
--- a/tensorflow/core/kernels/sparse_xent_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/sparse_xent_op_gpu.cu.cc
@@ -20,22 +20,50 @@ limitations under the License.
 #include "tensorflow/core/kernels/sparse_xent_op.h"
 
 #include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/reduction_gpu_kernels.cu.h"
+#include "tensorflow/core/kernels/reduction_ops_common.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
+namespace functor {
+
+// Partial specialization for a GPUDevice, that uses the CUB implementation
+// from reduction_gpu_kernels.cu.h.
+template <typename T>
+struct RowMaxReduction<GPUDevice, T> {
+  // Computes the maximum across the rows of logits
+  //
+  // logits: batch_size, num_classes.
+  // maximum: temporary tensor, dims: batch_size, 1
+  static inline void Compute(OpKernelContext* ctx,
+                             typename TTypes<T>::ConstMatrix logits,
+                             typename TTypes<T>::Vec maximum) {
+    const int kBatchDim = 0;
+    const int kClassDim = 1;
+    const int rows = logits.dimension(kBatchDim);
+    const int cols = logits.dimension(kClassDim);
+
+    typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes;
+    Constants<GPUDevice> constants;
+    cub::Max op;
+    functor::ReduceImpl<T, cub::Max, T*, const T*, ReductionAxes>(
+        ctx, maximum.data(), logits.data(), 2, rows, cols, 1, 1, constants.kOne,
+        op);
+  }
+};
+
 // Partial specialization for a GPUDevice, that uses the Eigen implementation
 // from XentEigenImpl.
-namespace functor {
 template <typename T, typename Index>
 struct SparseXentFunctor<GPUDevice, T, Index> {
-  void operator()(const GPUDevice& d, typename TTypes<T>::ConstMatrix logits,
+  void operator()(OpKernelContext* ctx, typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<Index>::ConstVec labels,
                   typename TTypes<T>::Vec scratch, typename TTypes<T>::Vec loss,
                   typename TTypes<T>::Matrix backprop) {
-    SparseXentEigenImpl<GPUDevice, T, Index>::Compute(d, logits, labels,
+    SparseXentEigenImpl<GPUDevice, T, Index>::Compute(ctx, logits, labels,
                                                       scratch, loss, backprop);
   }
 };
diff --git a/tensorflow/core/kernels/stack.cc b/tensorflow/core/kernels/stack.cc
index 5c70a2d62d36b94362c6f10473644f2623b77d2a..2af6b4b8148807df9e1f7c0de65f664efe6acc79 100644
--- a/tensorflow/core/kernels/stack.cc
+++ b/tensorflow/core/kernels/stack.cc
@@ -96,7 +96,7 @@ class Stack : public ResourceBase {
 
   DataType ElemType() { return elem_type_; }
 
-  string DebugString() override {
+  string DebugString() const override {
     mutex_lock l(mu_);
     return strings::StrCat("Stack[", stack_name_, "]");
   }
diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc
index c91bdc43cf4636481f141df70f30b1f2d74dc1a2..65174e163c1031d3e480159824f984e4bf83980b 100644
--- a/tensorflow/core/kernels/stage_op.cc
+++ b/tensorflow/core/kernels/stage_op.cc
@@ -132,7 +132,7 @@ class Buffer : public ResourceBase {
     notify_inserters_if_bounded(&lock);
   }
 
-  string DebugString() override {
+  string DebugString() const override {
     std::unique_lock<std::mutex> lock(mu_);
     return strings::StrCat("Staging size: ", buf_.size());
   }
@@ -170,7 +170,7 @@ class Buffer : public ResourceBase {
   std::size_t capacity_;
   std::size_t memory_limit_;
   std::size_t current_bytes_;
-  std::mutex mu_;
+  mutable std::mutex mu_;
   std::condition_variable non_empty_cond_var_;
   std::condition_variable full_cond_var_;
   std::deque<Tuple> buf_;
diff --git a/tensorflow/core/kernels/summary_kernels.cc b/tensorflow/core/kernels/summary_kernels.cc
index b287f0cc2f1337cff5705b5a40ba455b837307f9..5e3465d1dd6ce24a82525704f5223b6d9f0ac39f 100644
--- a/tensorflow/core/kernels/summary_kernels.cc
+++ b/tensorflow/core/kernels/summary_kernels.cc
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorboard/db/schema.h"
-#include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
-#include "tensorflow/contrib/tensorboard/db/summary_file_writer.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/db/sqlite.h"
 #include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/summary/schema.h"
+#include "tensorflow/core/summary/summary_db_writer.h"
+#include "tensorflow/core/summary/summary_file_writer.h"
 #include "tensorflow/core/util/event.pb.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/summary_op.cc b/tensorflow/core/kernels/summary_op.cc
index 1f4e3418f4826dee789002d4aa688f8ce14e17d2..1053aa7d53ad5f831f8127036d8156cdde772b70 100644
--- a/tensorflow/core/kernels/summary_op.cc
+++ b/tensorflow/core/kernels/summary_op.cc
@@ -124,7 +124,9 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER)
 struct HistogramResource : public ResourceBase {
   histogram::ThreadSafeHistogram histogram;
 
-  string DebugString() override { return "A histogram summary. Stats ..."; }
+  string DebugString() const override {
+    return "A histogram summary. Stats ...";
+  }
 };
 
 class SummaryMergeOp : public OpKernel {
diff --git a/tensorflow/core/kernels/tensor_array.h b/tensorflow/core/kernels/tensor_array.h
index 384a63e945306637bcf074d1f3709eea055bffe9..507ab459ca5ee773e7fa3f3c77dc511a55957dd0 100644
--- a/tensorflow/core/kernels/tensor_array.h
+++ b/tensorflow/core/kernels/tensor_array.h
@@ -261,7 +261,7 @@ class TensorArray : public ResourceBase {
     return Status::OK();
   }
 
-  string DebugString() override {
+  string DebugString() const override {
     mutex_lock l(mu_);
     CHECK(!closed_);
     return strings::StrCat("TensorArray[", tensors_.size(), "]");
@@ -376,7 +376,7 @@ class TensorArray : public ResourceBase {
   const DataType dtype_;
   Tensor handle_;
 
-  mutex mu_;
+  mutable mutex mu_;
 
   // Marks that the tensor_array_ has been cleared.
   bool closed_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/kernels/tensor_forest/BUILD b/tensorflow/core/kernels/tensor_forest/BUILD
index df035506f7698d1d213efad6088e9bfb53d97282..0060410c95787fb69d206b646afd66c31a821f05 100644
--- a/tensorflow/core/kernels/tensor_forest/BUILD
+++ b/tensorflow/core/kernels/tensor_forest/BUILD
@@ -27,7 +27,6 @@ tf_kernel_library(
         ":resources",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:tensor_forest_ops_op_lib",
         "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
     ],
 )
@@ -39,7 +38,6 @@ tf_kernel_library(
         ":resources",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:tensor_forest_ops_op_lib",
         "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
     ],
 )
diff --git a/tensorflow/core/kernels/tensor_forest/resources.h b/tensorflow/core/kernels/tensor_forest/resources.h
index da258e5017ca8cc9b996d83bcd767e89d61322d7..f0a78f97264336acc9ba293d6547cc0fe10343ee 100644
--- a/tensorflow/core/kernels/tensor_forest/resources.h
+++ b/tensorflow/core/kernels/tensor_forest/resources.h
@@ -34,7 +34,7 @@ class TensorForestTreeResource : public ResourceBase {
  public:
   TensorForestTreeResource();
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("TensorForestTree[size=", get_size(), "]");
   }
 
diff --git a/tensorflow/core/kernels/tile_functor_gpu.cu.cc b/tensorflow/core/kernels/tile_functor_gpu.h
similarity index 85%
rename from tensorflow/core/kernels/tile_functor_gpu.cu.cc
rename to tensorflow/core/kernels/tile_functor_gpu.h
index 84a5060fc3cd17c09b905d606dba62bbaa7f1373..0de32e730ed858ccc3dfcbacb65a7cf922aa5ce2 100644
--- a/tensorflow/core/kernels/tile_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/tile_functor_gpu.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_
+
 #if GOOGLE_CUDA
 
 #define EIGEN_USE_GPU
@@ -80,28 +83,7 @@ void TileSimple(const Device& d, Tensor* out, const Tensor& in) {
 }
 
 }  // end namespace internal
-
-namespace functor {
-
-typedef Eigen::GpuDevice GPUDevice;
-
-// Register functors used for Tile functor.
-#define DEFINE_TYPE(T)                       \
-  template struct Tile<GPUDevice, T, int32>; \
-  template struct Tile<GPUDevice, T, int64>;
-
-TF_CALL_bool(DEFINE_TYPE);
-TF_CALL_int16(DEFINE_TYPE);
-TF_CALL_int32(DEFINE_TYPE);
-TF_CALL_int64(DEFINE_TYPE);
-TF_CALL_float(DEFINE_TYPE);
-TF_CALL_double(DEFINE_TYPE);
-TF_CALL_half(DEFINE_TYPE);
-TF_CALL_complex64(DEFINE_TYPE);
-TF_CALL_complex128(DEFINE_TYPE);
-
-#undef DEFINE_TYPE
-
-}  // end namespace functor
 }  // namespace tensorflow
 #endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_
diff --git a/tensorflow/core/kernels/tile_functor_gpu_bool.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_bool.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c7a814c7a2c4de5964deb2eff875235f293cd7b0
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_bool.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, bool, int32>;
+template struct Tile<GpuDevice, bool, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_complex128.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_complex128.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4dfa4bac1b6a08acc4c8eed18785785b3e4d6071
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_complex128.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, complex128, int32>;
+template struct Tile<GpuDevice, complex128, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_complex64.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_complex64.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..525ede938fd6d31df514ad9f6c049d62f8c25740
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_complex64.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, complex64, int32>;
+template struct Tile<GpuDevice, complex64, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_double.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_double.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..25e024083e3d3ed44af51f1ff1ae2fb1305be526
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_double.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, double, int32>;
+template struct Tile<GpuDevice, double, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_float.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_float.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f0f31370e43cdd3e06aadfe6daf0eb988cfd6ce4
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_float.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, float, int32>;
+template struct Tile<GpuDevice, float, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_half.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_half.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6c3810a0bc63de50360845e5c56a693ebff56c2e
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_half.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, Eigen::half, int32>;
+template struct Tile<GpuDevice, Eigen::half, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_int16.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_int16.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2280dcbc82d320586ca262c8c372970a70958f27
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_int16.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, int16, int32>;
+template struct Tile<GpuDevice, int16, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_int32.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_int32.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b05403badae96d24fde13c1532eb32ab67695d06
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_int32.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, int32, int32>;
+template struct Tile<GpuDevice, int32, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_int64.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_int64.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2d83c6b3a1c2257b47ab978767713e9d93d22323
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_int64.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, int64, int32>;
+template struct Tile<GpuDevice, int64, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu.cu.cc b/tensorflow/core/kernels/topk_op_gpu.h
similarity index 98%
rename from tensorflow/core/kernels/topk_op_gpu.cu.cc
rename to tensorflow/core/kernels/topk_op_gpu.h
index 2fbe1fe7cbb5ad0d90dfcb651fdbb8359c7c1d69..6f3bec20f6919e3257fd823699afd23e3ccc0653 100644
--- a/tensorflow/core/kernels/topk_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/topk_op_gpu.h
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_
 
 #if GOOGLE_CUDA
 
@@ -561,14 +563,8 @@ struct TopKFunctor<GPUDevice, T> {
 };
 
 }  // end namespace functor
-
-#define INSTANTIATE_TEMPLATE(type) \
-  template struct functor::TopKFunctor<GPUDevice, type>;
-
-TF_CALL_GPU_NUMBER_TYPES(INSTANTIATE_TEMPLATE);
-TF_CALL_INTEGRAL_TYPES(INSTANTIATE_TEMPLATE);
-#undef INSTANTIATE_TEMPLATE
-
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_
diff --git a/tensorflow/core/kernels/topk_op_gpu_double.cu.cc b/tensorflow/core/kernels/topk_op_gpu_double.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8a5a7e71b1b3126335acd75d1061b816046a18b7
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_double.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, double>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_float.cu.cc b/tensorflow/core/kernels/topk_op_gpu_float.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0b69396bb13dc4414e07e742c7ed90b03fc3df51
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_float.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, float>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_half.cu.cc b/tensorflow/core/kernels/topk_op_gpu_half.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e53586aeca2d00c1d6e6e75fad9538abc8ba1d6a
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_half.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, Eigen::half>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_int16.cu.cc b/tensorflow/core/kernels/topk_op_gpu_int16.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5bd310523c98d33cadd6324296468629f0dbec4b
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_int16.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, int16>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_int32.cu.cc b/tensorflow/core/kernels/topk_op_gpu_int32.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..55b393a0c02b15c4bce08994e1d8a4e82684d97b
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_int32.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, int32>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_int64.cu.cc b/tensorflow/core/kernels/topk_op_gpu_int64.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3e4a775056310d2e58d8f339bcace213741ef699
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_int64.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, int64>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_int8.cu.cc b/tensorflow/core/kernels/topk_op_gpu_int8.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ac73cd170b8fbd956921120ac106b0b1813b1605
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_int8.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, int8>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_uint16.cu.cc b/tensorflow/core/kernels/topk_op_gpu_uint16.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8d5f8ceb06d171c43cf25e59fe47602f4410977f
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_uint16.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, uint16>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_uint8.cu.cc b/tensorflow/core/kernels/topk_op_gpu_uint8.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fc1a8a2c8cca11e52d2b9eb53c269cc78e44b3d1
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_uint8.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, uint8>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc
index eadea18f760b6109c6c10700285a2a2e54e4b083..00994bbe8e7142f0c8ca7a31aef7f0a540b48824 100644
--- a/tensorflow/core/kernels/variable_ops.cc
+++ b/tensorflow/core/kernels/variable_ops.cc
@@ -35,7 +35,7 @@ class LegacyVar : public ResourceBase {
   mutex* mu() { return &mu_; }
   Tensor* tensor() { return &tensor_; }
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat(DataTypeString(tensor_.dtype()), "/",
                            tensor_.shape().DebugString());
   }
@@ -116,7 +116,7 @@ class TemporaryVariableOp : public OpKernel {
     mutex mu;
     Tensor val;
     string name;
-    string DebugString() override { return name; }
+    string DebugString() const override { return name; }
     ~TmpVar() override { VLOG(3) << "TmpVar " << name << " deleted"; }
   };
 
diff --git a/tensorflow/core/kernels/where_op.cc b/tensorflow/core/kernels/where_op.cc
index 3330442ffd602c7293a4ddc3c675524698364c4e..aa65fa6b637e3077c456b3c724effc759c26c7dd 100644
--- a/tensorflow/core/kernels/where_op.cc
+++ b/tensorflow/core/kernels/where_op.cc
@@ -137,8 +137,10 @@ class WhereCPUOp : public OpKernel {
     const int input_dims = input.dims();
 
     Tensor num_true;
-    OP_REQUIRES_OK(
-        context, context->allocate_temp(DT_INT64, TensorShape({}), &num_true));
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    OP_REQUIRES_OK(context, context->allocate_temp(DT_INT64, TensorShape({}),
+                                                   &num_true, attr));
     auto num_true_t = num_true.scalar<int64>();
 
     Status s = functor::NumTrue<CPUDevice, T, int64>::Compute(
@@ -368,6 +370,12 @@ class WhereGPUOp : public AsyncOpKernel {
       Name("Where").Device(DEVICE_GPU).TypeConstraint<T>("T"), WhereGPUOp<T>);
 
 TF_CALL_WHERE_GPU_TYPES(REGISTER_GPU_WHERE_OP);
+REGISTER_KERNEL_BUILDER(Name("Where")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<int32>("T")
+                            .HostMemory("input")
+                            .HostMemory("index"),
+                        WhereCPUOp<int32>);
 
 #undef REGISTER_GPU_WHERE_OP
 
diff --git a/tensorflow/core/kernels/where_op.h b/tensorflow/core/kernels/where_op.h
index e63b3ba8cde5e284a8ef7664a4453fef343cdfa2..7297d37ffb8fc19dd924a4396b110b4e87bf795c 100644
--- a/tensorflow/core/kernels/where_op.h
+++ b/tensorflow/core/kernels/where_op.h
@@ -27,7 +27,6 @@ namespace tensorflow {
 #define TF_CALL_WHERE_GPU_TYPES(m) \
   TF_CALL_int8(m);                 \
   TF_CALL_uint8(m);                \
-  TF_CALL_int32(m);                \
   TF_CALL_int64(m);                \
   TF_CALL_float(m);                \
   TF_CALL_double(m);               \
diff --git a/tensorflow/core/lib/core/errors.h b/tensorflow/core/lib/core/errors.h
index d5cbe6c61674b80978ec16d5c00d3747b667e1f5..4815f7c2cc6c4197c4dbd6017213e275d38b105e 100644
--- a/tensorflow/core/lib/core/errors.h
+++ b/tensorflow/core/lib/core/errors.h
@@ -150,6 +150,10 @@ string FormatColocationNodeForError(const T& names) {
       });
 }
 
+inline string FormatFunctionForError(const string& name) {
+  return strings::StrCat("{{function_node ", name, "}}");
+}
+
 // The CanonicalCode() for non-errors.
 using ::tensorflow::error::OK;
 
diff --git a/tensorflow/core/lib/core/status.cc b/tensorflow/core/lib/core/status.cc
index 3076c0933739943007d9da4f34da7b6399a7be7c..0b63f66f6da0792b0cdba23ea3e5a4abba5e4bdc 100644
--- a/tensorflow/core/lib/core/status.cc
+++ b/tensorflow/core/lib/core/status.cc
@@ -200,7 +200,7 @@ Status StatusGroup::as_status() const {
          const std::pair<error::Code, int>& b) { return a.second < b.second; });
 
   fmt.push_back(
-      strings::Printf("Combined status information from %lu operations:\n",
+      strings::Printf("Combined status information from %zu operations:\n",
                       num_ok_ + children_.size()));
 
   for (const auto& p : count_vec) {
diff --git a/tensorflow/core/lib/gif/gif_io.cc b/tensorflow/core/lib/gif/gif_io.cc
index 9a5215320f58d10c22872c2837e882bed82f5b52..3fad8c8b14249bebd8f88a1d94a2c338cd076965 100644
--- a/tensorflow/core/lib/gif/gif_io.cc
+++ b/tensorflow/core/lib/gif/gif_io.cc
@@ -82,9 +82,20 @@ uint8* Decode(const void* srcdata, int datasize,
     return nullptr;
   }
 
+  // Don't request more memory than needed for each frame, preventing OOM
+  int max_frame_width = 0;
+  int max_frame_height = 0;
+  for (int k = 0; k < gif_file->ImageCount; k++) {
+    SavedImage* si = &gif_file->SavedImages[k];
+    if (max_frame_height < si->ImageDesc.Height)
+      max_frame_height = si->ImageDesc.Height;
+    if (max_frame_width < si->ImageDesc.Width)
+      max_frame_width = si->ImageDesc.Width;
+  }
+
   const int num_frames = gif_file->ImageCount;
-  const int width = gif_file->SWidth;
-  const int height = gif_file->SHeight;
+  const int width = max_frame_width;    // gif_file->SWidth;
+  const int height = max_frame_height;  // gif_file->SHeight;
   const int channel = 3;
 
   uint8* const dstdata = allocate_output(num_frames, width, height, channel);
diff --git a/tensorflow/core/lib/gtl/stl_util.h b/tensorflow/core/lib/gtl/stl_util.h
index ffeca4e88a93936ee6a1711afec735d97d04172e..853a290bf6383c679ddc9c00dbce38d18d3d35b6 100644
--- a/tensorflow/core/lib/gtl/stl_util.h
+++ b/tensorflow/core/lib/gtl/stl_util.h
@@ -23,9 +23,12 @@ limitations under the License.
 #include <iterator>
 #include <memory>
 #include <string>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
+#include "absl/meta/type_traits.h"
+
 namespace tensorflow {
 namespace gtl {
 
@@ -48,16 +51,38 @@ inline const T* vector_as_array(const std::vector<T, Allocator>* v) {
   return v->data();
 }
 
+namespace gtl_internal {
+
+// HasMember is true_type or false_type, depending on whether or not
+// T has a __resize_default_init member. Resize will call the
+// __resize_default_init member if it exists, and will call the resize
+// member otherwise.
+template <typename string_type, typename = void>
+struct ResizeUninitializedTraits {
+  using HasMember = std::false_type;
+  static void Resize(string_type* s, size_t new_size) { s->resize(new_size); }
+};
+
+// __resize_default_init is provided by libc++ >= 8.0 and by Google's internal
+// ::string implementation.
+template <typename string_type>
+struct ResizeUninitializedTraits<
+    string_type, absl::void_t<decltype(std::declval<string_type&>()
+                                           .__resize_default_init(237))> > {
+  using HasMember = std::true_type;
+  static void Resize(string_type* s, size_t new_size) {
+    s->__resize_default_init(new_size);
+  }
+};
+
+}  // namespace gtl_internal
+
 // Like str->resize(new_size), except any new characters added to "*str" as a
 // result of resizing may be left uninitialized, rather than being filled with
 // '0' bytes. Typically used when code is then going to overwrite the backing
-// store of the string with known data. Uses a Google extension to ::string.
+// store of the string with known data.
 inline void STLStringResizeUninitialized(string* s, size_t new_size) {
-#if __google_stl_resize_uninitialized_string
-  s->resize_uninitialized(new_size);
-#else
-  s->resize(new_size);
-#endif
+  gtl_internal::ResizeUninitializedTraits<string>::Resize(s, new_size);
 }
 
 // Calls delete (non-array version) on the SECOND item (pointer) in each pair in
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 6e571305d6b577f50c17b4e9712073a96d4a5de6..9698673dfe81757cbea21f4a588c137384f7b373 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -13631,6 +13631,151 @@ op {
     }
   }
 }
+op {
+  name: "Conv2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+        s: "EXPLICIT"
+      }
+    }
+  }
+  attr {
+    name: "explicit_paddings"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv2DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
 op {
   name: "Conv2DBackpropFilter"
   input_arg {
@@ -13655,6 +13800,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -13693,6 +13839,18 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
   name: "Conv2DBackpropFilter"
@@ -13720,6 +13878,7 @@ op {
         type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
@@ -13818,6 +13977,15 @@ op {
       list {
         s: "SAME"
         s: "VALID"
+        s: "EXPLICIT"
+      }
+    }
+  }
+  attr {
+    name: "explicit_paddings"
+    type: "list(int)"
+    default_value {
+      list {
       }
     }
   }
@@ -14063,6 +14231,92 @@ op {
     }
   }
 }
+op {
+  name: "Conv2DBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+        s: "EXPLICIT"
+      }
+    }
+  }
+  attr {
+    name: "explicit_paddings"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
 op {
   name: "Conv3D"
   input_arg {
@@ -53199,6 +53453,43 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceScatterNdSub"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceScatterNdUpdate"
   input_arg {
@@ -76588,6 +76879,34 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListConcat"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+}
 op {
   name: "TensorListConcatLists"
   input_arg {
@@ -76792,6 +77111,22 @@ op {
     }
   }
 }
+op {
+  name: "TensorListResize"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
 op {
   name: "TensorListScatter"
   input_arg {
diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc
index 01ebcd15439d670274d7e2a784ce78c5c1ee44ef..cbc9c7a2f4589924929c8ca6c16b85c04566d620 100644
--- a/tensorflow/core/ops/list_ops.cc
+++ b/tensorflow/core/ops/list_ops.cc
@@ -212,10 +212,16 @@ REGISTER_OP("TensorListConcat")
     .Output("tensor: element_dtype")
     .Output("lengths: int64")
     .Attr("element_dtype: type")
+    .Attr("element_shape: shape = { unknown_rank: true }")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       DataType element_dtype;
       TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
-      shape_inference::ShapeHandle element_shape = c->UnknownShape();
+      PartialTensorShape raw_element_shape;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_shape", &raw_element_shape));
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(raw_element_shape,
+                                                            &element_shape));
+
       auto* handle_data = c->input_handle_shapes_and_types(0);
       if (handle_data != nullptr && handle_data->size() != 1) {
         return errors::InvalidArgument(
@@ -231,10 +237,10 @@ REGISTER_OP("TensorListConcat")
               DataTypeString(list_shape_type.dtype), " but expected type ",
               DataTypeString(element_dtype));
         }
-        shape_inference::ShapeHandle ignored;
+        shape_inference::ShapeHandle merged;
         TF_RETURN_IF_ERROR(
-            c->Merge(element_shape, list_shape_type.shape, &ignored));
-        element_shape = list_shape_type.shape;
+            c->Merge(element_shape, list_shape_type.shape, &merged));
+        element_shape = merged;
       }
       if (c->RankKnown(element_shape)) {
         shape_inference::ShapeHandle result;
@@ -367,6 +373,24 @@ REGISTER_OP("TensorListGetItem")
       return Status::OK();
     });
 
+REGISTER_OP("TensorListResize")
+    .Input("input_handle: variant")
+    .Input("size: int32")
+    .Output("output_handle: variant")
+    .SetIsStateful()
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      // Check that `size` has scalar shape.
+      shape_inference::ShapeHandle size_shape = c->input(1);
+      shape_inference::ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(size_shape, 0, &unused));
+      c->set_output(0, c->Scalar());
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr) {
+        c->set_output_handle_shapes_and_types(0, *handle_data);
+      }
+      return Status::OK();
+    });
+
 REGISTER_OP("TensorListSetItem")
     .Input("input_handle: variant")
     .Input("index: int32")
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 0e94259f755ee6dbb9d6d98e2514a1424ca309e2..0f4f72593746100fc9bc82a4d7070fa361c5e86f 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -269,10 +269,11 @@ REGISTER_OP("Conv2D")
     .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
-    .Attr(GetPaddingAttrString())
+    .Attr(GetPaddingAttrStringWithExplicit())
+    .Attr(GetExplicitPaddingsAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
-    .SetShapeFn(shape_inference::Conv2DShape);
+    .SetShapeFn(shape_inference::Conv2DShapeWithExplicitPadding);
 
 REGISTER_OP("Conv2DBackpropInput")
     .Input("input_sizes: int32")
@@ -282,7 +283,8 @@ REGISTER_OP("Conv2DBackpropInput")
     .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
-    .Attr(GetPaddingAttrString())
+    .Attr(GetPaddingAttrStringWithExplicit())
+    .Attr(GetExplicitPaddingsAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
@@ -304,7 +306,8 @@ REGISTER_OP("Conv2DBackpropFilter")
     .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
-    .Attr(GetPaddingAttrString())
+    .Attr(GetPaddingAttrStringWithExplicit())
+    .Attr(GetExplicitPaddingsAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index b8497ae2b134661806aee99cb7c138f8b25e028c..4a1b3a6f2db83682dc70a83b01844a84bfb5f129 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -5876,6 +5876,15 @@ op {
       list {
         s: "SAME"
         s: "VALID"
+        s: "EXPLICIT"
+      }
+    }
+  }
+  attr {
+    name: "explicit_paddings"
+    type: "list(int)"
+    default_value {
+      list {
       }
     }
   }
@@ -5953,6 +5962,15 @@ op {
       list {
         s: "SAME"
         s: "VALID"
+        s: "EXPLICIT"
+      }
+    }
+  }
+  attr {
+    name: "explicit_paddings"
+    type: "list(int)"
+    default_value {
+      list {
       }
     }
   }
@@ -6030,6 +6048,15 @@ op {
       list {
         s: "SAME"
         s: "VALID"
+        s: "EXPLICIT"
+      }
+    }
+  }
+  attr {
+    name: "explicit_paddings"
+    type: "list(int)"
+    default_value {
+      list {
       }
     }
   }
@@ -26773,6 +26800,43 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceScatterNdSub"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceScatterNdUpdate"
   input_arg {
@@ -36719,6 +36783,15 @@ op {
     name: "element_dtype"
     type: "type"
   }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
 }
 op {
   name: "TensorListConcatLists"
@@ -36924,6 +36997,22 @@ op {
     }
   }
 }
+op {
+  name: "TensorListResize"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
 op {
   name: "TensorListScatter"
   input_arg {
diff --git a/tensorflow/core/ops/state_ops.cc b/tensorflow/core/ops/state_ops.cc
index aa975cb77bafb3b31f0d612d0f662cef0bde06f2..d2bf033461ebdc99889bae5357704205e6172501 100644
--- a/tensorflow/core/ops/state_ops.cc
+++ b/tensorflow/core/ops/state_ops.cc
@@ -231,6 +231,15 @@ REGISTER_OP("ResourceScatterNdAdd")
     .Attr("use_locking: bool = true")
     .SetShapeFn(shape_inference::ScatterNdUpdateShape);
 
+REGISTER_OP("ResourceScatterNdSub")
+    .Input("ref: resource")
+    .Input("indices: Tindices")
+    .Input("updates: T")
+    .Attr("T: type")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("use_locking: bool = true")
+    .SetShapeFn(shape_inference::ScatterNdUpdateShape);
+
 REGISTER_OP("ScatterNdAdd")
     .Input("ref: Ref(T)")
     .Input("indices: Tindices")
diff --git a/tensorflow/core/platform/cloud/oauth_client.cc b/tensorflow/core/platform/cloud/oauth_client.cc
index 9b85cae9b90eabfd303ee465ac90e9121c7285cf..a8657359a3561d84b37a47a2696641e869ed567a 100644
--- a/tensorflow/core/platform/cloud/oauth_client.cc
+++ b/tensorflow/core/platform/cloud/oauth_client.cc
@@ -95,6 +95,11 @@ Status CreateSignature(RSA* private_key, StringPiece to_sign,
   if (!md) {
     return errors::Internal("Could not get a sha256 encryptor.");
   }
+
+  // EVP_MD_CTX_destroy is renamed to EVP_MD_CTX_free in OpenSSL 1.1.0 but
+  // the old name is still retained as a compatibility macro.
+  // Keep this around until support is dropped for OpenSSL 1.0
+  // https://www.openssl.org/news/cl110.txt
   std::unique_ptr<EVP_MD_CTX, std::function<void(EVP_MD_CTX*)>> md_ctx(
       EVP_MD_CTX_create(), [](EVP_MD_CTX* ptr) { EVP_MD_CTX_destroy(ptr); });
   if (!md_ctx) {
@@ -119,7 +124,6 @@ Status CreateSignature(RSA* private_key, StringPiece to_sign,
   if (EVP_DigestSignFinal(md_ctx.get(), sig.get(), &sig_len) != 1) {
     return errors::Internal("DigestFinal (signature compute) failed.");
   }
-  EVP_MD_CTX_cleanup(md_ctx.get());
   return Base64Encode(StringPiece(reinterpret_cast<char*>(sig.get()), sig_len),
                       signature);
 }
diff --git a/tensorflow/core/platform/cloud/oauth_client_test.cc b/tensorflow/core/platform/cloud/oauth_client_test.cc
index 1cd0641cd3a7dd8376a365f243d63cbfc6b177c2..ce3b9d79c8b12c85a47b5ee6a773f9fadccb2127 100644
--- a/tensorflow/core/platform/cloud/oauth_client_test.cc
+++ b/tensorflow/core/platform/cloud/oauth_client_test.cc
@@ -166,7 +166,6 @@ TEST(OAuthClientTest, GetTokenFromServiceAccountJson) {
                 const_cast<unsigned char*>(
                     reinterpret_cast<const unsigned char*>(signature.data())),
                 signature.size()));
-  EVP_MD_CTX_cleanup(md_ctx);
 
   // Free all the crypto-related resources.
   EVP_PKEY_free(key);
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 769e28902521c64c0020caf08cfd97a948eaac10..e26828c75e4476089c239fbdb3f03cf6c9fb6b11 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -663,6 +663,8 @@ def tf_additional_cloud_op_deps():
         "//tensorflow:ios": [],
         "//tensorflow:linux_s390x": [],
         "//tensorflow:windows": [],
+        "//tensorflow:api_version_2": [],
+        "//tensorflow:windows_and_api_version_2": [],
         "//tensorflow:no_gcp_support": [],
         "//conditions:default": [
             "//tensorflow/contrib/cloud:bigquery_reader_ops_op_lib",
@@ -670,13 +672,15 @@ def tf_additional_cloud_op_deps():
         ],
     })
 
-# TODO(jart, jhseu): Delete when GCP is default on.
+# TODO(jhseu): Delete when GCP is default on.
 def tf_additional_cloud_kernel_deps():
     return select({
         "//tensorflow:android": [],
         "//tensorflow:ios": [],
         "//tensorflow:linux_s390x": [],
         "//tensorflow:windows": [],
+        "//tensorflow:api_version_2": [],
+        "//tensorflow:windows_and_api_version_2": [],
         "//tensorflow:no_gcp_support": [],
         "//conditions:default": [
             "//tensorflow/contrib/cloud/kernels:bigquery_reader_ops",
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index da1f66dc6763121819fe443066acc40c1d5fa79d..c0c1022c1629f95bd17e651545facbfa088dd441 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -32,16 +32,24 @@ cc_library(
 
 tf_cuda_library(
     name = "stream_executor",
+    cuda_deps = ["//tensorflow/stream_executor/cuda:cuda_activation"],
     deps = [
         "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:dnn",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:multi_platform_manager",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor/cuda:cuda_platform_id",
+        "//tensorflow/stream_executor/host:host_platform_id",
+        "//tensorflow/stream_executor/platform:dso_loader",
     ] + select({
-        "//tensorflow:using_cuda_clang": ["//tensorflow/stream_executor:cuda_platform"],
-        "//tensorflow:using_cuda_nvcc": ["//tensorflow/stream_executor:cuda_platform"],
-        "//tensorflow:using_cuda_clang_with_dynamic_build": [],
-        "//tensorflow:using_cuda_nvcc_with_dynamic_build": [],
+        "@local_config_cuda//cuda:darwin": ["IOKit"],
         "//conditions:default": [],
     }) + select({
-        "@local_config_cuda//cuda:darwin": ["IOKit"],
+        "//tensorflow:using_cuda_clang": ["//tensorflow/stream_executor/cuda:all_runtime"],
+        "//tensorflow:using_cuda_nvcc": ["//tensorflow/stream_executor/cuda:all_runtime"],
+        "//tensorflow:using_cuda_clang_with_dynamic_build": [],
+        "//tensorflow:using_cuda_nvcc_with_dynamic_build": [],
         "//conditions:default": [],
     }),
 )
@@ -49,9 +57,10 @@ tf_cuda_library(
 cc_library(
     name = "stream_executor_cuda",
     deps = [
-        "//tensorflow/stream_executor",
+        ":stream_executor_no_cuda",
+        ":cuda",
     ] + if_static(
-        ["//tensorflow/stream_executor:cuda_platform"],
+        ["//tensorflow/stream_executor/cuda:all_runtime"],
     ) + select({
         "@local_config_cuda//cuda:darwin": ["IOKit"],
         "//conditions:default": [],
@@ -62,23 +71,31 @@ cc_library(
     name = "stream_executor_no_cuda",
     deps = [
         "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:dnn",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:multi_platform_manager",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor/cuda:cuda_platform_id",
+        "//tensorflow/stream_executor/host:host_platform",
+        "//tensorflow/stream_executor/host:host_platform_id",
+        "//tensorflow/stream_executor/platform:dso_loader",
     ],
 )
 
 # Dummy stream executor cuda plugins.
 cc_library(
     name = "cublas_plugin",
-    srcs = [],
+    deps = if_static(["//tensorflow/stream_executor/cuda:cublas_plugin"]),
 )
 
 cc_library(
     name = "cufft_plugin",
-    srcs = [],
+    deps = if_static(["//tensorflow/stream_executor/cuda:cufft_plugin"]),
 )
 
 cc_library(
     name = "cudnn_plugin",
-    srcs = [],
+    deps = if_static(["//tensorflow/stream_executor/cuda:cudnn_plugin"]),
 )
 
 # OSX framework for device driver access
diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc
index afc4201e5382194b02b8b0f5cdebfc90688c9f00..63394174455089c64e1e889e35f578437f7fb4fc 100644
--- a/tensorflow/core/platform/env.cc
+++ b/tensorflow/core/platform/env.cc
@@ -29,6 +29,9 @@ limitations under the License.
 #include "tensorflow/core/platform/windows/wide_char.h"
 #define PATH_MAX MAX_PATH
 #else
+#include <fcntl.h>
+#include <string.h>
+#include <sys/types.h>
 #include <unistd.h>
 #endif
 
@@ -314,7 +317,31 @@ string Env::GetExecutablePath() {
   string file_path = WideCharToUtf8(wc_file_path);
   std::copy(file_path.begin(), file_path.end(), exe_path);
 #else
-  CHECK_NE(-1, readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
+  char buf[PATH_MAX] = {0};
+  int path_length = readlink("/proc/self/exe", buf, sizeof(buf) - 1);
+  CHECK_NE(-1, path_length);
+
+  if (strstr(buf, "python") != nullptr) {
+    // Discard the path of the python binary, and any flags.
+    int fd = open("/proc/self/cmdline", O_RDONLY);
+    int cmd_length = read(fd, buf, PATH_MAX - 1);
+    CHECK_NE(-1, cmd_length);
+    int token_pos = 0;
+    for (bool token_is_first_or_flag = true; token_is_first_or_flag;) {
+      // Get token length, including null
+      int token_len = strlen(&buf[token_pos]) + 1;
+      token_is_first_or_flag = false;
+      // Check if we can skip without overshooting
+      if (token_pos + token_len < cmd_length) {
+        token_pos += token_len;
+        token_is_first_or_flag = (buf[token_pos] == '-');  // token is a flag
+      }
+    }
+    snprintf(exe_path, sizeof(exe_path), "%s", &buf[token_pos]);
+  } else {
+    snprintf(exe_path, sizeof(exe_path), "%s", buf);
+  }
+
 #endif
   // Make sure it's null-terminated:
   exe_path[sizeof(exe_path) - 1] = 0;
diff --git a/tensorflow/core/platform/fake_python_env_test.cc b/tensorflow/core/platform/fake_python_env_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b521db3c054bff0e324a3b0571e0af7f47c269c4
--- /dev/null
+++ b/tensorflow/core/platform/fake_python_env_test.cc
@@ -0,0 +1,65 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file has "python" in its name. Thus, it should trigger the python
+// specific code paths.
+
+#include <sys/stat.h>
+#include <unistd.h>
+#include <iostream>
+#include <string>
+
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+
+int myargc;
+char** myargv;
+
+char kMagicBazelDirSubstring[] = ".runfiles/org_tensorflow";
+char kPythonFile[] =
+    "/some/path/to/pythontest.runfiles/org_tensorflow/stuff/to/run.py";
+
+namespace tensorflow {
+
+TEST(FakePythonEnvTest, GetExecutablePath) {
+  // See if argc is greater than 1 and first arg is kPythonFile
+  // If not, rerun the executable with proper args.
+  if (myargc <= 1 || strstr(myargv[1], kMagicBazelDirSubstring) == nullptr) {
+    const char* filename = myargv[0];
+    char* new_argv[] = {
+        myargv[0],
+        kPythonFile,
+        nullptr,
+    };
+
+    execv(filename, new_argv);
+  }
+
+  Env* env = Env::Default();
+  // We depend on the file/executable name to include python and fool the
+  // library to think this is running under the python interpreter.
+  string path = env->GetExecutablePath();
+  EXPECT_TRUE(strstr(path.c_str(), kMagicBazelDirSubstring) != nullptr);
+}
+
+}  // namespace tensorflow
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  myargc = argc;
+  myargv = argv;
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/core/platform/posix/env.cc b/tensorflow/core/platform/posix/env.cc
index 0a939aef25236dc33e2be8ec1d76f9ea0075e350..d87e5dcfe70cc802a8ac5865445f508ff795aa34 100644
--- a/tensorflow/core/platform/posix/env.cc
+++ b/tensorflow/core/platform/posix/env.cc
@@ -121,13 +121,25 @@ class PosixEnv : public Env {
 
   string GetRunfilesDir() override {
     string bin_path = this->GetExecutablePath();
-    string runfiles_path = bin_path + ".runfiles/org_tensorflow";
+    string runfiles_suffix = ".runfiles/org_tensorflow";
+    std::size_t pos = bin_path.find(runfiles_suffix);
+
+    // Sometimes (when executing under python) bin_path returns the full path to
+    // the python scripts under runfiles. Get the substring.
+    if (pos != std::string::npos) {
+      return bin_path.substr(0, pos + runfiles_suffix.length());
+    }
+
+    // See if we have the executable path. if executable.runfiles exists, return
+    // that folder.
+    string runfiles_path = bin_path + runfiles_suffix;
     Status s = this->IsDirectory(runfiles_path);
     if (s.ok()) {
       return runfiles_path;
-    } else {
-      return bin_path.substr(0, bin_path.find_last_of("/\\"));
     }
+
+    // If nothing can be found, return something close.
+    return bin_path.substr(0, bin_path.find_last_of("/\\"));
   }
 
  private:
diff --git a/tensorflow/core/platform/setround.cc b/tensorflow/core/platform/setround.cc
index 592626bfa17e691d1b10ddce5c7f0f31ed825861..5573b2fc93f8b28777e78ad50d423ecb57409821 100644
--- a/tensorflow/core/platform/setround.cc
+++ b/tensorflow/core/platform/setround.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/platform/setround.h"
 
+#include <cfenv>  // NOLINT
+
 namespace tensorflow {
 namespace port {
 
diff --git a/tensorflow/core/platform/stream_executor.h b/tensorflow/core/platform/stream_executor.h
index 0a590b3d40c0dbf007feee07fc93be4838924679..42822859f6e12372511f10809bd416b5054b7202 100644
--- a/tensorflow/core/platform/stream_executor.h
+++ b/tensorflow/core/platform/stream_executor.h
@@ -18,11 +18,6 @@ limitations under the License.
 
 #include "tensorflow/core/platform/platform.h"
 
-#if defined(PLATFORM_GOOGLE)
-#include "tensorflow/stream_executor/platform/google/dso_loader.h"
-#else
-#include "tensorflow/stream_executor/dso_loader.h"
-#endif
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/dnn.h"
@@ -31,6 +26,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 #include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/scratch_allocator.h"
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor.h"
diff --git a/tensorflow/core/platform/stream_executor_no_cuda.h b/tensorflow/core/platform/stream_executor_no_cuda.h
index 50a5e732c0ec222d3ee2329a57fc6ea9ac4b233c..123035cc8a69cd895ad92a505951cc3441b27988 100644
--- a/tensorflow/core/platform/stream_executor_no_cuda.h
+++ b/tensorflow/core/platform/stream_executor_no_cuda.h
@@ -18,11 +18,6 @@ limitations under the License.
 
 #include "tensorflow/core/platform/platform.h"
 
-#if defined(PLATFORM_GOOGLE)
-#include "tensorflow/stream_executor/platform/google/dso_loader.h"
-#else
-#include "tensorflow/stream_executor/dso_loader.h"
-#endif
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/dnn.h"
@@ -31,6 +26,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 #include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/scratch_allocator.h"
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor.h"
diff --git a/tensorflow/contrib/tensorboard/db/BUILD b/tensorflow/core/summary/BUILD
similarity index 98%
rename from tensorflow/contrib/tensorboard/db/BUILD
rename to tensorflow/core/summary/BUILD
index 6507546ee9f81108add181a9c83064c9860005e2..a89175cdb1db2ff1184d8da26bc180d578faaf69 100644
--- a/tensorflow/contrib/tensorboard/db/BUILD
+++ b/tensorflow/core/summary/BUILD
@@ -1,5 +1,5 @@
 # Description:
-#   TensorBoard database code.
+#   C++ implementation code for the summary writing APIs.
 
 package(default_visibility = ["//tensorflow:internal"])
 
diff --git a/tensorflow/contrib/tensorboard/db/loader.cc b/tensorflow/core/summary/loader.cc
similarity index 97%
rename from tensorflow/contrib/tensorboard/db/loader.cc
rename to tensorflow/core/summary/loader.cc
index 6439328022329cbc56d767e787ec9d6797045768..68535feacfae6d8c9edf6b0725fe4d4c8d63bf60 100644
--- a/tensorflow/contrib/tensorboard/db/loader.cc
+++ b/tensorflow/core/summary/loader.cc
@@ -15,8 +15,8 @@ limitations under the License.
 #include <iostream>
 #include <vector>
 
-#include "tensorflow/contrib/tensorboard/db/schema.h"
-#include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
+#include "tensorflow/core/summary/schema.h"
+#include "tensorflow/core/summary/summary_db_writer.h"
 #include "tensorflow/core/lib/db/sqlite.h"
 #include "tensorflow/core/lib/io/record_reader.h"
 #include "tensorflow/core/platform/init_main.h"
diff --git a/tensorflow/contrib/tensorboard/db/schema.cc b/tensorflow/core/summary/schema.cc
similarity index 99%
rename from tensorflow/contrib/tensorboard/db/schema.cc
rename to tensorflow/core/summary/schema.cc
index 3c7bc87e4a2dbeadef2b9589d58c845204049123..822e2fa3bfdaf2be5f03704fc83d39f0e00369d3 100644
--- a/tensorflow/contrib/tensorboard/db/schema.cc
+++ b/tensorflow/core/summary/schema.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/schema.h"
+#include "tensorflow/core/summary/schema.h"
 
 #include "tensorflow/core/lib/core/errors.h"
 
diff --git a/tensorflow/contrib/tensorboard/db/schema.h b/tensorflow/core/summary/schema.h
similarity index 87%
rename from tensorflow/contrib/tensorboard/db/schema.h
rename to tensorflow/core/summary/schema.h
index 3da450422523dbe4304446869a38d43981d76eb5..6305f8eabd7cacb9dca8922b694e92ca4596d777 100644
--- a/tensorflow/contrib/tensorboard/db/schema.h
+++ b/tensorflow/core/summary/schema.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_TENSORBOARD_DB_SCHEMA_H_
-#define TENSORFLOW_CONTRIB_TENSORBOARD_DB_SCHEMA_H_
+#ifndef TENSORFLOW_CORE_SUMMARY_SCHEMA_H_
+#define TENSORFLOW_CORE_SUMMARY_SCHEMA_H_
 
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/db/sqlite.h"
@@ -30,4 +30,4 @@ Status SetupTensorboardSqliteDb(Sqlite* db);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORBOARD_DB_SCHEMA_H_
+#endif  // TENSORFLOW_CORE_SUMMARY_SCHEMA_H_
diff --git a/tensorflow/contrib/tensorboard/db/schema_test.cc b/tensorflow/core/summary/schema_test.cc
similarity index 95%
rename from tensorflow/contrib/tensorboard/db/schema_test.cc
rename to tensorflow/core/summary/schema_test.cc
index 4d3f2880bd02682ad00a90760f2a4478f1e6b2a2..fa21b45b62cca2b116010de87a2dc2bae5cbe866 100644
--- a/tensorflow/contrib/tensorboard/db/schema_test.cc
+++ b/tensorflow/core/summary/schema_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/schema.h"
+#include "tensorflow/core/summary/schema.h"
 
 #include <memory>
 
diff --git a/tensorflow/contrib/tensorboard/db/summary_converter.cc b/tensorflow/core/summary/summary_converter.cc
similarity index 99%
rename from tensorflow/contrib/tensorboard/db/summary_converter.cc
rename to tensorflow/core/summary/summary_converter.cc
index 93c1183072b4d791843e740f970234ba52857463..e6e34e9602fa8cc3ed91d773d1d4cbec0d0c5232 100644
--- a/tensorflow/contrib/tensorboard/db/summary_converter.cc
+++ b/tensorflow/core/summary/summary_converter.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/summary_converter.h"
+#include "tensorflow/core/summary/summary_converter.h"
 
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/summary.pb.h"
diff --git a/tensorflow/contrib/tensorboard/db/summary_converter.h b/tensorflow/core/summary/summary_converter.h
similarity index 89%
rename from tensorflow/contrib/tensorboard/db/summary_converter.h
rename to tensorflow/core/summary/summary_converter.h
index 329c7f9f2f9fe25cdff8d5ac2e52c25362f624c2..dc005d2604ff1687e765341ebdb9e86c62c78f3a 100644
--- a/tensorflow/contrib/tensorboard/db/summary_converter.h
+++ b/tensorflow/core/summary/summary_converter.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_CONVERTER_H_
-#define TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_CONVERTER_H_
+#ifndef TENSORFLOW_CORE_SUMMARY_SUMMARY_CONVERTER_H_
+#define TENSORFLOW_CORE_SUMMARY_SUMMARY_CONVERTER_H_
 
 #include "tensorflow/core/framework/summary.pb.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -35,4 +35,4 @@ Status AddTensorAsAudioToSummary(const Tensor& tensor, const string& tag,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_CONVERTER_H_
+#endif  // TENSORFLOW_CORE_SUMMARY_SUMMARY_CONVERTER_H_
diff --git a/tensorflow/contrib/tensorboard/db/summary_db_writer.cc b/tensorflow/core/summary/summary_db_writer.cc
similarity index 99%
rename from tensorflow/contrib/tensorboard/db/summary_db_writer.cc
rename to tensorflow/core/summary/summary_db_writer.cc
index cfdc884277a025aa11995d329389f3748b17490c..b203d439ccf82b36b3d0e1bdd958fdcfac87f4b0 100644
--- a/tensorflow/contrib/tensorboard/db/summary_db_writer.cc
+++ b/tensorflow/core/summary/summary_db_writer.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
+#include "tensorflow/core/summary/summary_db_writer.h"
 
 #include <deque>
 
-#include "tensorflow/contrib/tensorboard/db/summary_converter.h"
+#include "tensorflow/core/summary/summary_converter.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -972,7 +972,7 @@ class SummaryDbWriter : public SummaryWriterInterface {
     return MigrateEvent(std::move(e));
   }
 
-  string DebugString() override { return "SummaryDbWriter"; }
+  string DebugString() const override { return "SummaryDbWriter"; }
 
  private:
   Status Write(int64 step, const Tensor& t, const string& tag,
diff --git a/tensorflow/contrib/tensorboard/db/summary_db_writer.h b/tensorflow/core/summary/summary_db_writer.h
similarity index 89%
rename from tensorflow/contrib/tensorboard/db/summary_db_writer.h
rename to tensorflow/core/summary/summary_db_writer.h
index 746da1533b157bf7b2be5c85ada8b61ba224cc3e..5669afe7f67e1019d3d62d45ea99a64f1a31c82e 100644
--- a/tensorflow/contrib/tensorboard/db/summary_db_writer.h
+++ b/tensorflow/core/summary/summary_db_writer.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_DB_WRITER_H_
-#define TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_DB_WRITER_H_
+#ifndef TENSORFLOW_CORE_SUMMARY_SUMMARY_DB_WRITER_H_
+#define TENSORFLOW_CORE_SUMMARY_SUMMARY_DB_WRITER_H_
 
 #include "tensorflow/core/kernels/summary_interface.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -39,4 +39,4 @@ Status CreateSummaryDbWriter(Sqlite* db, const string& experiment_name,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_DB_WRITER_H_
+#endif  // TENSORFLOW_CORE_SUMMARY_SUMMARY_DB_WRITER_H_
diff --git a/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc b/tensorflow/core/summary/summary_db_writer_test.cc
similarity index 99%
rename from tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
rename to tensorflow/core/summary/summary_db_writer_test.cc
index 2e8d4109dd624ab66d774668ad04def9a7d3cdf2..c4e9ddea2c51673c94273900b0407517b6533f3d 100644
--- a/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
+++ b/tensorflow/core/summary/summary_db_writer_test.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
+#include "tensorflow/core/summary/summary_db_writer.h"
 
-#include "tensorflow/contrib/tensorboard/db/schema.h"
+#include "tensorflow/core/summary/schema.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc b/tensorflow/core/summary/summary_file_writer.cc
similarity index 97%
rename from tensorflow/contrib/tensorboard/db/summary_file_writer.cc
rename to tensorflow/core/summary/summary_file_writer.cc
index 22b6f09d0cd88068f7bedabe7687920420a3028f..711a7d3d1007090259f34652f10cf43a4d0c5f0a 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
+++ b/tensorflow/core/summary/summary_file_writer.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/summary_file_writer.h"
+#include "tensorflow/core/summary/summary_file_writer.h"
 
-#include "tensorflow/contrib/tensorboard/db/summary_converter.h"
+#include "tensorflow/core/summary/summary_converter.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
@@ -148,7 +148,7 @@ class SummaryFileWriter : public SummaryWriterInterface {
     return Status::OK();
   }
 
-  string DebugString() override { return "SummaryFileWriter"; }
+  string DebugString() const override { return "SummaryFileWriter"; }
 
  private:
   double GetWallTime() {
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer.h b/tensorflow/core/summary/summary_file_writer.h
similarity index 89%
rename from tensorflow/contrib/tensorboard/db/summary_file_writer.h
rename to tensorflow/core/summary/summary_file_writer.h
index 73b0a5542beabdc460c32156dd44aacc5f08610a..7d964516da3ceecdc4cdedae000ba873ec92e1e9 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer.h
+++ b/tensorflow/core/summary/summary_file_writer.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_FILE_WRITER_H_
-#define TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_FILE_WRITER_H_
+#ifndef TENSORFLOW_CORE_SUMMARY_SUMMARY_FILE_WRITER_H_
+#define TENSORFLOW_CORE_SUMMARY_SUMMARY_FILE_WRITER_H_
 
 #include "tensorflow/core/kernels/summary_interface.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -40,4 +40,4 @@ Status CreateSummaryFileWriter(int max_queue, int flush_millis,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_FILE_WRITER_H_
+#endif  // TENSORFLOW_CORE_SUMMARY_SUMMARY_FILE_WRITER_H_
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc b/tensorflow/core/summary/summary_file_writer_test.cc
similarity index 99%
rename from tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
rename to tensorflow/core/summary/summary_file_writer_test.cc
index ffbfb9533e887e54b0f5bdfde11dadce21073a94..d3b19c3abdb8b773e22472c5987d91852fc6ac8e 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
+++ b/tensorflow/core/summary/summary_file_writer_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/summary_file_writer.h"
+#include "tensorflow/core/summary/summary_file_writer.h"
 
 #include "tensorflow/core/framework/summary.pb.h"
 #include "tensorflow/core/framework/tensor.pb.h"
diff --git a/tensorflow/contrib/tensorboard/db/vacuum.cc b/tensorflow/core/summary/vacuum.cc
similarity index 100%
rename from tensorflow/contrib/tensorboard/db/vacuum.cc
rename to tensorflow/core/summary/vacuum.cc
diff --git a/tensorflow/core/util/event.proto b/tensorflow/core/util/event.proto
index 9ce85be551191dee754f34ec531e65f3eac056b7..2d3ae62777358ee371c60fe9b04d27d140c6f414 100644
--- a/tensorflow/core/util/event.proto
+++ b/tensorflow/core/util/event.proto
@@ -95,7 +95,7 @@ enum WorkerHealth {
 // signal is received.
 enum WorkerShutdownMode {
   DEFAULT = 0;
-  SHUTDOWN_IMMEDIATELY = 1;
+  NOT_CONFIGURED = 1;
   WAIT_FOR_COORDINATOR = 2;
 }
 
diff --git a/tensorflow/core/util/padding.cc b/tensorflow/core/util/padding.cc
index 117de5ee4bdd61af148ad7f1e620e940cb38216a..9e7fb8489e8e37b94ebecd53fde0568c68879c92 100644
--- a/tensorflow/core/util/padding.cc
+++ b/tensorflow/core/util/padding.cc
@@ -29,12 +29,55 @@ Status GetNodeAttr(const NodeDef& node_def, StringPiece attr_name,
     *value = SAME;
   } else if (str_value == "VALID") {
     *value = VALID;
+  } else if (str_value == "EXPLICIT") {
+    *value = EXPLICIT;
   } else {
     return errors::NotFound(str_value, " is not an allowed padding type");
   }
   return Status::OK();
 }
 
+Status CheckValidPadding(Padding padding_type,
+                         const std::vector<int64>& explicit_paddings,
+                         int num_dims, TensorFormat data_format) {
+  if (padding_type == Padding::EXPLICIT) {
+    if (explicit_paddings.size() != 2 * num_dims) {
+      return errors::InvalidArgument(
+          "explicit_paddings attribute must contain ", 2 * num_dims,
+          " values, but got: ", explicit_paddings.size());
+    }
+    for (int64 padding_value : explicit_paddings) {
+      if (padding_value < 0) {
+        return errors::InvalidArgument(
+            "All elements of explicit_paddings must be nonnegative");
+      }
+    }
+    const int32 batch_index = GetTensorBatchDimIndex(num_dims, data_format);
+    const int32 depth_index = GetTensorFeatureDimIndex(num_dims, data_format);
+    if (explicit_paddings[2 * batch_index] != 0 ||
+        explicit_paddings[2 * batch_index + 1] != 0 ||
+        explicit_paddings[2 * depth_index] != 0 ||
+        explicit_paddings[2 * depth_index + 1] != 0) {
+      return errors::InvalidArgument(
+          "Nonzero explicit padding in the batch or depth dimensions is not "
+          "supported");
+    }
+  } else if (!explicit_paddings.empty()) {
+    return errors::InvalidArgument(
+        "explicit_paddings attribute must be empty if the padding attribute is "
+        "not EXPLICIT");
+  }
+  return Status::OK();
+}
+
 string GetPaddingAttrString() { return "padding: {'SAME', 'VALID'}"; }
 
+string GetPaddingAttrStringWithExplicit() {
+  return "padding: {'SAME', 'VALID', 'EXPLICIT'}";
+}
+
+string GetExplicitPaddingsAttrString() {
+  return "explicit_paddings: list(int) = []";
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/util/padding.h b/tensorflow/core/util/padding.h
index 76f9b4dd9a99e7b4e152ca0c06b9323acf84b13d..a1dd1c0bd9556935f233609683a79452f3692e06 100644
--- a/tensorflow/core/util/padding.h
+++ b/tensorflow/core/util/padding.h
@@ -20,8 +20,10 @@ limitations under the License.
 // kernels.
 
 #include <string>
+#include <vector>
 
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/util/tensor_format.h"
 
 namespace tensorflow {
 
@@ -34,16 +36,29 @@ class NodeDef;
 //   VALID: No padding is carried out.
 //   SAME: The pad value is computed so that the output will have the same
 //         dimensions as the input.
+//   EXPLICIT: The user specifies the pad values in the explicit_padding
+//             attribute.
 // The padded area is zero-filled.
 enum Padding {
-  VALID = 1,  // No padding.
-  SAME = 2,   // Input and output layers have the same size.
+  VALID = 1,     // No padding.
+  SAME = 2,      // Input and output layers have the same size.
+  EXPLICIT = 3,  // Padding is explicitly specified
 };
 
+// Returns an error if the padding attributes are invalid.
+Status CheckValidPadding(Padding padding_type,
+                         const std::vector<int64>& explicit_paddings,
+                         int num_dims, TensorFormat data_format);
+
 // Return the string containing the list of valid padding types, that can be
 // used as an Attr() in REGISTER_OP.
 string GetPaddingAttrString();
 
+// Like GetPaddingAttrString(), but also includes EXPLICIT.
+string GetPaddingAttrStringWithExplicit();
+
+string GetExplicitPaddingsAttrString();
+
 // Specialization to parse an attribute directly into a Padding enum.
 Status GetNodeAttr(const NodeDef& node_def, StringPiece attr_name,
                    Padding* value);
diff --git a/tensorflow/core/util/presized_cuckoo_map.h b/tensorflow/core/util/presized_cuckoo_map.h
index f88ad2faaff344832d65b04357c3d8c2665ebad5..1cdde34562a7616827850fde830373350138687d 100644
--- a/tensorflow/core/util/presized_cuckoo_map.h
+++ b/tensorflow/core/util/presized_cuckoo_map.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/prefetch.h"
 
 namespace tensorflow {
 
@@ -132,6 +133,15 @@ class PresizedCuckooMap {
            FindInBucket(k, fast_map_to_buckets(h2(tk)), out);
   }
 
+  // Prefetch memory associated with the key k into cache levels specified by
+  // hint.
+  template <port::PrefetchHint hint = port::PREFETCH_HINT_T0>
+  void PrefetchKey(const key_type k) const {
+    const uint64 tk = key_transform(k);
+    port::prefetch<hint>(&buckets_[fast_map_to_buckets(tk)].keys);
+    port::prefetch<hint>(&buckets_[fast_map_to_buckets(h2(tk))].keys);
+  }
+
   int64 MemoryUsed() const {
     return sizeof(PresizedCuckooMap<value>) + sizeof(CuckooPathQueue);
   }
diff --git a/tensorflow/core/util/presized_cuckoo_map_test.cc b/tensorflow/core/util/presized_cuckoo_map_test.cc
index f2be1e8a2fffdd9b61839809667a858a512751d2..f2c7904b00452487ceef4a8f8a870af548e1af03 100644
--- a/tensorflow/core/util/presized_cuckoo_map_test.cc
+++ b/tensorflow/core/util/presized_cuckoo_map_test.cc
@@ -13,12 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/util/presized_cuckoo_map.h"
 #include <array>
+
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/fingerprint.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/util/presized_cuckoo_map.h"
 
 namespace tensorflow {
 namespace {
@@ -50,6 +51,14 @@ TEST(PresizedCuckooMapTest, Basic) {
   EXPECT_EQ(out, 2);
 }
 
+TEST(PresizedCuckooMapTest, Prefetch) {
+  PresizedCuckooMap<int64> pscm(2);
+  EXPECT_TRUE(pscm.InsertUnique(1, 2));
+  // Works for both present and absent keys.
+  pscm.PrefetchKey(1);
+  pscm.PrefetchKey(2);
+}
+
 TEST(PresizedCuckooMapTest, TooManyItems) {
   static constexpr int kTableSize = 1000;
   PresizedCuckooMap<int> pscm(kTableSize);
diff --git a/tensorflow/core/util/stats_calculator.h b/tensorflow/core/util/stats_calculator.h
index e191737bb2c8eb85518e51b3a06884a7983a392e..5005ee08a4bf3292097820983ad85a8b56377a82 100644
--- a/tensorflow/core/util/stats_calculator.h
+++ b/tensorflow/core/util/stats_calculator.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <stdlib.h>
 
+#include <algorithm>
 #include <cmath>
 #include <limits>
 #include <map>
diff --git a/tensorflow/core/util/tensor_format.h b/tensorflow/core/util/tensor_format.h
index a296fb447e252e62809aeb17d9d00cf35ad15fc9..643e14e0b56bb152b5ca135cd4b813108b8eab16 100644
--- a/tensorflow/core/util/tensor_format.h
+++ b/tensorflow/core/util/tensor_format.h
@@ -408,18 +408,24 @@ inline int32 GetTensorDimIndex(TensorFormat format, char dimension) {
   return GetTensorDimIndex<2>(format, dimension);
 }
 
+inline int32 GetTensorDimIndex(TensorFormat format, char dimension,
+                               int num_total_dims) {
+  int32 index = (GetTensorSpatialDims(num_total_dims, format) == 3)
+                    ? GetTensorDimIndex<3>(format, dimension)
+                    : GetTensorDimIndex<2>(format, dimension);
+  CHECK(index >= 0 && index < num_total_dims)  // Crash OK.
+      << "Invalid index from the dimension: " << index << ", " << format << ", "
+      << dimension;
+  return index;
+}
+
 // Return the element from 'dimension_attributes' that corresponds to the
 // specified 'dimension' according to 'tensor_format'.
 template <typename T>
 T GetTensorDim(gtl::ArraySlice<T> dimension_attributes,
                TensorFormat tensor_format, char dimension) {
   int index =
-      (GetTensorSpatialDims(dimension_attributes.size(), tensor_format) == 3)
-          ? GetTensorDimIndex<3>(tensor_format, dimension)
-          : GetTensorDimIndex<2>(tensor_format, dimension);
-  CHECK(index >= 0 && index < dimension_attributes.size())
-      << "Invalid index from the dimension: " << index << ", " << tensor_format
-      << ", " << dimension;
+      GetTensorDimIndex(tensor_format, dimension, dimension_attributes.size());
   return dimension_attributes[index];
 }
 
@@ -476,6 +482,15 @@ inline int64 GetFilterDim(const Tensor& tensor,
   return GetFilterDim(tensor.shape(), filter_tensor_format, dimension);
 }
 
+inline void GetExplicitPaddingForDim(
+    const std::vector<int64>& explicit_paddings, TensorFormat tensor_format,
+    char dimension, int64* padding_before, int64* padding_after) {
+  int index =
+      GetTensorDimIndex(tensor_format, dimension, explicit_paddings.size() / 2);
+  *padding_before = explicit_paddings[2 * index];
+  *padding_after = explicit_paddings[2 * index + 1];
+}
+
 // Return the string that specifies the data format for convnet operations.
 string GetConvnetDataFormatAttrString();
 string GetConvnet3dDataFormatAttrString();
diff --git a/tensorflow/examples/learn/BUILD b/tensorflow/examples/learn/BUILD
index d6ec1f393bab82a45f0c1032670b5abed42bf6d3..a22d55e5af7630d5660a59970244357897aa1aa3 100644
--- a/tensorflow/examples/learn/BUILD
+++ b/tensorflow/examples/learn/BUILD
@@ -28,17 +28,8 @@ sh_test(
     size = "large",
     srcs = ["examples_test.sh"],
     data = [
-        ":boston",
-        ":iris",
         ":iris_custom_decay_dnn",
         ":iris_custom_model",
-        ":iris_run_config",
-        ":random_forest_mnist",
-        ":resnet",
-        ":text_classification",
-        ":text_classification_character_cnn",
-        ":text_classification_character_rnn",
-        ":text_classification_cnn",
     ],
     tags = [
         "manual",
diff --git a/tensorflow/examples/speech_commands/label_wav.py b/tensorflow/examples/speech_commands/label_wav.py
index 0017aec3a54bdcd2ddaec6a1012d629f83564827..eb8323454c23c07d5b536bbdfec30d690767a0fd 100644
--- a/tensorflow/examples/speech_commands/label_wav.py
+++ b/tensorflow/examples/speech_commands/label_wav.py
@@ -45,7 +45,7 @@ FLAGS = None
 
 def load_graph(filename):
   """Unpersists graph from file as default graph."""
-  with tf.gfile.FastGFile(filename, 'rb') as f:
+  with tf.gfile.GFile(filename, 'rb') as f:
     graph_def = tf.GraphDef()
     graph_def.ParseFromString(f.read())
     tf.import_graph_def(graph_def, name='')
diff --git a/tensorflow/examples/speech_commands/label_wav_dir.py b/tensorflow/examples/speech_commands/label_wav_dir.py
index a34db512dda86be138e07a4ffaa1963fe00a5cea..2e1890c3e864b153a4e01badf08b5b55b4377ab6 100644
--- a/tensorflow/examples/speech_commands/label_wav_dir.py
+++ b/tensorflow/examples/speech_commands/label_wav_dir.py
@@ -46,7 +46,7 @@ FLAGS = None
 
 def load_graph(filename):
   """Unpersists graph from file as default graph."""
-  with tf.gfile.FastGFile(filename, 'rb') as f:
+  with tf.gfile.GFile(filename, 'rb') as f:
     graph_def = tf.GraphDef()
     graph_def.ParseFromString(f.read())
     tf.import_graph_def(graph_def, name='')
diff --git a/tensorflow/go/genop/internal/api_def_map.go b/tensorflow/go/genop/internal/api_def_map.go
index 8600452b476dee49292cbffe630026cf6077e22b..0bbd88b61c345906a13944aa3c7ad7b0582fffae 100644
--- a/tensorflow/go/genop/internal/api_def_map.go
+++ b/tensorflow/go/genop/internal/api_def_map.go
@@ -31,7 +31,7 @@ import (
 	"unsafe"
 
 	"github.com/golang/protobuf/proto"
-	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/tensorflow/core/framework_go_proto"
+	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework"
 )
 
 // Encapsulates a collection of API definitions.
diff --git a/tensorflow/go/genop/internal/genop.go b/tensorflow/go/genop/internal/genop.go
index fb8163121850cee36e1fcc652ca258b1fe2d42ff..1c05715a1a2f50b857c78e8c192d6c865b70e6c7 100644
--- a/tensorflow/go/genop/internal/genop.go
+++ b/tensorflow/go/genop/internal/genop.go
@@ -47,7 +47,7 @@ import (
 	"unsafe"
 
 	"github.com/golang/protobuf/proto"
-	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/tensorflow/core/framework_go_proto"
+	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework"
 )
 
 // GenerateFunctionsForRegisteredOps writes a Go source code file to w
diff --git a/tensorflow/go/genop/internal/genop_test.go b/tensorflow/go/genop/internal/genop_test.go
index d20d22e0c1502f92ade7ef5aa40985dce73b7552..acce6dea67c2e93309df70dd5009ad0dc086c523 100644
--- a/tensorflow/go/genop/internal/genop_test.go
+++ b/tensorflow/go/genop/internal/genop_test.go
@@ -22,7 +22,7 @@ import (
 	"testing"
 
 	"github.com/golang/protobuf/proto"
-	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/tensorflow/core/framework_go_proto"
+	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework"
 )
 
 // Creates an ApiDef based on opdef and applies overrides
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 90f0cc3b4849f2806bb31a7e838c6c92be383335..6f6fb793a02598782b1a8f592aeef3970dc4d99b 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -327,6 +327,192 @@ func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQua
 	return op.Output(0)
 }
 
+// Subtracts sparse `updates` from an existing tensor according to `indices`.
+//
+// This operation creates a new tensor by subtracting sparse `updates` from the
+// passed in `tensor`.
+// This operation is very similar to `tf.scatter_nd_sub`, except that the updates
+// are subtracted from an existing tensor (as opposed to a variable). If the memory
+// for the existing tensor cannot be re-used, a copy is made and updated.
+//
+// `indices` is an integer tensor containing indices into a new tensor of shape
+// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+//
+//     indices.shape[-1] <= shape.rank
+//
+// The last dimension of `indices` corresponds to indices into elements
+// (if `indices.shape[-1] = shape.rank`) or slices
+// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+// `shape`.  `updates` is a tensor with shape
+//
+//     indices.shape[:-1] + shape[indices.shape[-1]:]
+//
+// The simplest form of tensor_scatter_sub is to subtract individual elements
+// from a tensor by index. For example, say we want to insert 4 scattered elements
+// in a rank-1 tensor with 8 elements.
+//
+// In Python, this scatter subtract operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     tensor = tf.ones([8], dtype=tf.int32)
+//     updated = tf.tensor_scatter_sub(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [1, -10, 1, -9, -8, 1, 1, -11]
+//
+// We can also, insert entire slices of a higher rank tensor all at once. For
+// example, if we wanted to insert two slices in the first dimension of a
+// rank-3 tensor with two matrices of new values.
+//
+// In Python, this scatter add operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[0], [2]])
+//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]],
+//                            [[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
+//     tensor = tf.ones([4, 4, 4])
+//     updated = tf.tensor_scatter_sub(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [[[-4, -4, -4, -4], [-5, -5, -5, -5], [-6, -6, -6, -6], [-7, -7, -7, -7]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
+//      [[-4, -4, -4, -4], [-5, -5, -5, -5], [-6, -6, -6, -6], [-7, -7, -7, -7]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, the index is ignored.
+//
+// Arguments:
+//	tensor: Tensor to copy/update.
+//	indices: Index tensor.
+//	updates: Updates to scatter into output.
+//
+// Returns A new tensor copied from tensor and updates subtracted according to the indices.
+func TensorScatterSub(scope *Scope, tensor tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorScatterSub",
+		Input: []tf.Input{
+			tensor, indices, updates,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Scatter `updates` into an existing tensor according to `indices`.
+//
+// This operation creates a new tensor by applying sparse `updates` to the passed
+// in `tensor`.
+// This operation is very similar to `tf.scatter_nd`, except that the updates are
+// scattered onto an existing tensor (as opposed to a zero-tensor). If the memory
+// for the existing tensor cannot be re-used, a copy is made and updated.
+//
+// If `indices` contains duplicates, then their updates are accumulated (summed).
+//
+// **WARNING**: The order in which updates are applied is nondeterministic, so the
+// output will be nondeterministic if `indices` contains duplicates -- because
+// of some numerical approximation issues, numbers summed in different order
+// may yield different results.
+//
+// `indices` is an integer tensor containing indices into a new tensor of shape
+// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+//
+//     indices.shape[-1] <= shape.rank
+//
+// The last dimension of `indices` corresponds to indices into elements
+// (if `indices.shape[-1] = shape.rank`) or slices
+// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+// `shape`.  `updates` is a tensor with shape
+//
+//     indices.shape[:-1] + shape[indices.shape[-1]:]
+//
+// The simplest form of scatter is to insert individual elements in a tensor by
+// index. For example, say we want to insert 4 scattered elements in a rank-1
+// tensor with 8 elements.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt>
+// </div>
+//
+// In Python, this scatter operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     tensor = tf.ones([8], dtype=tf.int32)
+//     updated = tf.tensor_scatter_update(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [1, 11, 1, 10, 9, 1, 1, 12]
+//
+// We can also, insert entire slices of a higher rank tensor all at once. For
+// example, if we wanted to insert two slices in the first dimension of a
+// rank-3 tensor with two matrices of new values.
+//
+// In Python, this scatter operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[0], [2]])
+//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]],
+//                            [[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
+//     tensor = tf.ones([4, 4, 4])
+//     updated = tf.tensor_scatter_update(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
+//      [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, the index is ignored.
+//
+// Arguments:
+//	tensor: Tensor to copy/update.
+//	indices: Index tensor.
+//	updates: Updates to scatter into output.
+//
+// Returns A new tensor with the given shape and updates applied according
+// to the indices.
+func TensorScatterUpdate(scope *Scope, tensor tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorScatterUpdate",
+		Input: []tf.Input{
+			tensor, indices, updates,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Scatter `updates` into a new tensor according to `indices`.
 //
 // Creates a new tensor by applying sparse `updates` to individual values or
@@ -334,6 +520,10 @@ func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQua
 // the given `shape` according to indices.  This operator is the inverse of the
 // `tf.gather_nd` operator which extracts values or slices from a given tensor.
 //
+// This operation is similar to tensor_scatter_add, except that the tensor is
+// zero-initialized. Calling `tf.scatter_nd(indices, values, shape)` is identical
+// to `tensor_scatter_add(tf.zeros(shape, values.dtype), indices, values)`
+//
 // If `indices` contains duplicates, then their updates are accumulated (summed).
 //
 // **WARNING**: The order in which updates are applied is nondeterministic, so the
@@ -464,6 +654,15 @@ func QuantizeAndDequantizeV2RangeGiven(value bool) QuantizeAndDequantizeV2Attr {
 }
 
 // QuantizeAndDequantizeV2RoundMode sets the optional round_mode attribute to value.
+//
+// value: The 'round_mode' attribute controls which rounding tie-breaking algorithm is
+// used when rounding float values to their quantized equivalents. The following
+// rounding modes are currently supported:
+//
+// *   HALF_TO_EVEN: this is the default round_mode.
+// *   HALF_UP: round towards positive. In this mode 7.5 rounds up to 8 and -7.5
+//     rounds up to -7.
+//
 // If not specified, defaults to "HALF_TO_EVEN"
 func QuantizeAndDequantizeV2RoundMode(value string) QuantizeAndDequantizeV2Attr {
 	return func(m optionalAttr) {
@@ -523,7 +722,7 @@ func QuantizeAndDequantizeV2RoundMode(value string) QuantizeAndDequantizeV2Attr
 //
 // output = round(clamp(value, input_min, input_max) * scale_factor) / scale_factor.
 //
-// The above round function uses half to even rounding.
+// The above round function rounds the value based on the given round_mode.
 //
 //
 // Arguments:
@@ -3422,11 +3621,11 @@ func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) {
 // bucketized values for a single feature.
 //
 // Arguments:
-//	float_values: float; List of Rank 2 Tensor each containing float values for a single feature.
+//	float_values: float; List of Rank 1 Tensor each containing float values for a single feature.
 //	bucket_boundaries: float; List of Rank 1 Tensors each containing the bucket boundaries for a single
 // feature.
 //
-// Returns int; List of Rank 2 Tensors each containing the bucketized values for a single feature.
+// Returns int; List of Rank 1 Tensors each containing the bucketized values for a single feature.
 func BoostedTreesBucketize(scope *Scope, float_values []tf.Output, bucket_boundaries []tf.Output) (buckets []tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -3497,15 +3696,16 @@ func BoostedTreesQuantileStreamResourceFlush(scope *Scope, quantile_stream_resou
 
 // Makes the summary of quantiles for the batch.
 //
-// An op that takes a list of tensors and outputs the quantile summaries for each tensor.
+// An op that takes a list of tensors (one tensor per feature) and outputs the
+// quantile summaries for each tensor.
 //
 // Arguments:
-//	float_values: float; List of Rank 2 Tensors each containing values for a single feature.
+//	float_values: float; List of Rank 1 Tensors each containing values for a single feature.
 //	example_weights: float; Rank 1 Tensor with weights per instance.
 //	epsilon: float; The required maximum approximation error.
 //
-// Returns float; List of Rank 2 Tensors each containing the quantile summary (value, weight,
-// min_rank, max_rank) of a single feature.
+// Returns float; List of Rank 2 Tensors each containing the quantile summary
+// (value, weight, min_rank, max_rank) of a single feature.
 func BoostedTreesMakeQuantileSummaries(scope *Scope, float_values []tf.Output, example_weights tf.Output, epsilon tf.Output) (summaries []tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -3806,6 +4006,70 @@ func BoostedTreesEnsembleResourceHandleOp(scope *Scope, optional ...BoostedTrees
 	return op.Output(0)
 }
 
+// Output the logits for the given input data
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource.
+//	dense_features: Rank 2 dense features tensor.
+//	logits_dimension: Scalar, dimension of the logits.
+//
+// Returns The logits predictions from the tree for each instance in the batch.
+func TensorForestTreePredict(scope *Scope, tree_handle tf.Output, dense_features tf.Output, logits_dimension int64) (logits tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreePredict",
+		Input: []tf.Input{
+			tree_handle, dense_features,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Get the number of nodes in a tree
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource.
+//
+// Returns The size of the tree.
+func TensorForestTreeSize(scope *Scope, tree_handle tf.Output) (tree_size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreeSize",
+		Input: []tf.Input{
+			tree_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a tree resource and returns a handle to it.
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource to be created.
+//	tree_config: Serialized proto string of the boosted_trees.Tree.
+//
+// Returns the created operation.
+func TensorForestCreateTreeVariable(scope *Scope, tree_handle tf.Output, tree_config tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestCreateTreeVariable",
+		Input: []tf.Input{
+			tree_handle, tree_config,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
 type ComputeAccidentalHitsAttr func(optionalAttr)
 
@@ -4829,6 +5093,119 @@ func CudnnRNNParamsToCanonical(scope *Scope, num_layers tf.Output, num_units tf.
 	return weights, biases
 }
 
+// CudnnRNNBackpropV3Attr is an optional argument to CudnnRNNBackpropV3.
+type CudnnRNNBackpropV3Attr func(optionalAttr)
+
+// CudnnRNNBackpropV3RnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNBackpropV3RnnMode(value string) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNBackpropV3InputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNBackpropV3InputMode(value string) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNBackpropV3Direction sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNBackpropV3Direction(value string) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNBackpropV3Dropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropV3Dropout(value float32) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNBackpropV3Seed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropV3Seed(value int64) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNBackpropV3Seed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropV3Seed2(value int64) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Backprop step of CudnnRNNV3.
+//
+// Compute the backprop of both data and weights in a RNN. Takes an extra
+//     "sequence_lengths" input than CudnnRNNBackprop.
+//
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicates whether there is a linear projection between the input and
+//     the actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// sequence_lengths: a vector of lengths of each input sequence.
+// output: A 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: The same shape has input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
+// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
+//     pass.
+// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
+//     pass.
+// reserve_space: The same reserve_space produced in the forward operation.
+// input_backprop: The backprop to input in the forward pass. Has the same shape
+//     as input.
+// input_h_backprop: The backprop to input_h in the forward pass. Has the same
+//     shape as input_h.
+// input_c_backprop: The backprop to input_c in the forward pass. Has the same
+//     shape as input_c.
+// params_backprop: The backprop to the params buffer in the forward pass. Has the
+//     same shape as params.
+func CudnnRNNBackpropV3(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, sequence_lengths tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, host_reserved tf.Output, optional ...CudnnRNNBackpropV3Attr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNBackpropV3",
+		Input: []tf.Input{
+			input, input_h, input_c, params, sequence_lengths, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space, host_reserved,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+}
+
 // CudnnRNNBackpropV2Attr is an optional argument to CudnnRNNBackpropV2.
 type CudnnRNNBackpropV2Attr func(optionalAttr)
 
@@ -6697,6 +7074,34 @@ func Sign(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Creates a dataset that passes a sliding window over `input_dataset`.
+//
+// Arguments:
+//
+//	window_size: A scalar representing the number of elements in the
+// sliding window.
+//	window_shift: A scalar representing the steps moving the sliding window
+// forward in one iteration. It must be positive.
+//	window_stride: A scalar representing the stride of the input elements of the sliding window.
+// It must be positive.
+//
+//
+func ExperimentalSlidingWindowDataset(scope *Scope, input_dataset tf.Output, window_size tf.Output, window_shift tf.Output, window_stride tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalSlidingWindowDataset",
+		Input: []tf.Input{
+			input_dataset, window_size, window_shift, window_stride,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns which elements of x are finite.
 //
 // @compatibility(numpy)
@@ -7072,6 +7477,19 @@ func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
 	}
 }
 
+// Conv2DBackpropFilterExplicitPaddings sets the optional explicit_paddings attribute to value.
+//
+// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+// dimension, the amount of padding inserted before and after the dimension is
+// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
+// If not specified, defaults to <>
+func Conv2DBackpropFilterExplicitPaddings(value []int64) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["explicit_paddings"] = value
+	}
+}
+
 // Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
 //
 // value: Specify the data format of the input and output data. With the
@@ -7351,6 +7769,47 @@ func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Get the value of the tensor specified by its handle.
+//
+// Arguments:
+//	handle: The handle for a tensor stored in the session state.
+//	dtype: The type of the output value.
+//
+// Returns The tensor for the given handle.
+func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "GetSessionTensor",
+		Input: []tf.Input{
+			handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the gradient for the sqrt of `x` wrt its input.
+//
+// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SqrtGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // MatrixInverseAttr is an optional argument to MatrixInverse.
 type MatrixInverseAttr func(optionalAttr)
 
@@ -7567,6 +8026,33 @@ func Inv(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Creates a dataset that batches input elements into a SparseTensor.
+//
+// Arguments:
+//	input_dataset: A handle to an input dataset. Must have a single component.
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	row_shape: A vector representing the dense shape of each row in the produced
+// SparseTensor. The shape may be partially specified, using `-1` to indicate
+// that a particular dimension should use the maximum size of all batch elements.
+//
+//
+func ExperimentalDenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalDenseToSparseBatchDataset",
+		Input: []tf.Input{
+			input_dataset, batch_size, row_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // ComplexAbsAttr is an optional argument to ComplexAbs.
 type ComplexAbsAttr func(optionalAttr)
 
@@ -7703,7 +8189,7 @@ func RegexReplaceReplaceGlobal(value bool) RegexReplaceAttr {
 // Arguments:
 //	input: The text to be processed.
 //	pattern: The regular expression to match the input.
-//	rewrite: The rewrite to be applied to the matched expresion.
+//	rewrite: The rewrite to be applied to the matched expression.
 //
 // Returns The text after applying pattern and rewrite.
 func RegexReplace(scope *Scope, input tf.Output, pattern tf.Output, rewrite tf.Output, optional ...RegexReplaceAttr) (output tf.Output) {
@@ -8401,6 +8887,26 @@ func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output,
 	return scope.AddOperation(opspec)
 }
 
+// Checks whether a tree has been initialized.
+//
+// Arguments:
+//	tree_handle: Handle to the tree.
+//
+// Returns Whether the tree is initialized.
+func TensorForestTreeIsInitializedOp(scope *Scope, tree_handle tf.Output) (is_initialized tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreeIsInitializedOp",
+		Input: []tf.Input{
+			tree_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Gets next element for the provided shard number.
 //
 // Arguments:
@@ -10936,7 +11442,6 @@ func OneHotAxis(value int64) OneHotAttr {
 // =========
 //
 // Suppose that
-//
 // ```
 //   indices = [0, 2, -1, 1]
 //   depth = 3
@@ -10946,16 +11451,15 @@ func OneHotAxis(value int64) OneHotAttr {
 // ```
 //
 // Then output is `[4 x 3]`:
-//
-//     ```output =
-//       [5.0 0.0 0.0]  // one_hot(0)
-//       [0.0 0.0 5.0]  // one_hot(2)
-//       [0.0 0.0 0.0]  // one_hot(-1)
-//       [0.0 5.0 0.0]  // one_hot(1)
-//     ```
+// ```
+// output =
+//   [5.0 0.0 0.0]  // one_hot(0)
+//   [0.0 0.0 5.0]  // one_hot(2)
+//   [0.0 0.0 0.0]  // one_hot(-1)
+//   [0.0 5.0 0.0]  // one_hot(1)
+// ```
 //
 // Suppose that
-//
 // ```
 //   indices = [0, 2, -1, 1]
 //   depth = 3
@@ -10965,19 +11469,19 @@ func OneHotAxis(value int64) OneHotAttr {
 // ```
 //
 // Then output is `[3 x 4]`:
+// ```
+// output =
+//   [0.0 3.0 3.0 3.0]
+//   [3.0 3.0 3.0 0.0]
+//   [3.0 3.0 3.0 3.0]
+//   [3.0 0.0 3.0 3.0]
+// //  ^                one_hot(0)
+// //      ^            one_hot(2)
+// //          ^        one_hot(-1)
+// //              ^    one_hot(1)
+// ```
 //
-//     ```output =
-//       [0.0 3.0 3.0 3.0]
-//       [3.0 3.0 3.0 0.0]
-//       [3.0 3.0 3.0 3.0]
-//       [3.0 0.0 3.0 3.0]
-//     //  ^                one_hot(0)
-//     //      ^            one_hot(2)
-//     //          ^        one_hot(-1)
-//     //              ^    one_hot(1)
-//     ```
 // Suppose that
-//
 // ```
 //   indices = [[0, 2], [1, -1]]
 //   depth = 3
@@ -10987,15 +11491,16 @@ func OneHotAxis(value int64) OneHotAttr {
 // ```
 //
 // Then output is `[2 x 2 x 3]`:
-//
-//     ```output =
-//       [
-//         [1.0, 0.0, 0.0]  // one_hot(0)
-//         [0.0, 0.0, 1.0]  // one_hot(2)
-//       ][
-//         [0.0, 1.0, 0.0]  // one_hot(1)
-//         [0.0, 0.0, 0.0]  // one_hot(-1)
-//       ]```
+// ```
+// output =
+//   [
+//     [1.0, 0.0, 0.0]  // one_hot(0)
+//     [0.0, 0.0, 1.0]  // one_hot(2)
+//   ][
+//     [0.0, 1.0, 0.0]  // one_hot(1)
+//     [0.0, 0.0, 0.0]  // one_hot(-1)
+//   ]
+// ```
 //
 // Arguments:
 //	indices: A tensor of indices.
@@ -11113,15 +11618,12 @@ func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthEleme
 //
 // Arguments:
 //
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.END
-//   }
-//   out_arg {
-//     name: "output"
-//     description: <<END
-// Has same shape as data, except for the first `segment_ids.rank`
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
 // dimensions, which are replaced with a single dimension which has size
 // `num_segments`.
-//
 func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -11553,8 +12055,8 @@ func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...R
 // Arguments:
 //	params_nested_splits: The `nested_row_splits` tensors that define the row-partitioning for the
 // `params` RaggedTensor input.
-//	params_dense_values: The `inner_values` for the `params` RaggedTensor. There was a terminology change
-// at the python level from dense_values to inner_values, so dense_values is the
+//	params_dense_values: The `flat_values` for the `params` RaggedTensor. There was a terminology change
+// at the python level from dense_values to flat_values, so dense_values is the
 // deprecated name.
 //	indices: Indices in the outermost dimension of `params` of the values that should be
 // gathered.
@@ -11563,7 +12065,7 @@ func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...R
 // `indices.shape.ndims + params.ragged_rank - 1`.
 //
 // Returns The `nested_row_splits` tensors that define the row-partitioning for the
-// returned RaggedTensor.The `inner_values` for the returned RaggedTensor.
+// returned RaggedTensor.The `flat_values` for the returned RaggedTensor.
 func RaggedGather(scope *Scope, params_nested_splits []tf.Output, params_dense_values tf.Output, indices tf.Output, OUTPUT_RAGGED_RANK int64) (output_nested_splits []tf.Output, output_dense_values tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -11644,7 +12146,7 @@ func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_ou
 //
 // Arguments:
 //	rt_nested_splits: The `row_splits` for the `RaggedTensor`.
-//	rt_dense_values: The `inner_values` for the `RaggedTensor`.
+//	rt_dense_values: The `flat_values` for the `RaggedTensor`.
 //
 // Returns The indices for the `SparseTensor`.The values of the `SparseTensor`.`sparse_dense_shape` is a tight bounding box of the input `RaggedTensor`.
 func RaggedTensorToSparse(scope *Scope, rt_nested_splits []tf.Output, rt_dense_values tf.Output) (sparse_indices tf.Output, sparse_values tf.Output, sparse_dense_shape tf.Output) {
@@ -11787,217 +12289,6 @@ func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
-type ResourceSparseApplyRMSPropAttr func(optionalAttr)
-
-// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-//
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
-//
-// Returns the created operation.
-func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyRMSProp",
-		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
-type SampleDistortedBoundingBoxAttr func(optionalAttr)
-
-// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
-//
-// value: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
-// If not specified, defaults to 0.1
-func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["min_object_covered"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
-//
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
-//
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["area_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
-//
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["max_attempts"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
-//
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
-// If not specified, defaults to false
-func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
-	}
-}
-
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
-//
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example,
-//
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
-//
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.summary.image('images_with_box', image_with_box)
-//
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
-//
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
-//
-// Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
-//
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBox",
-		Input: []tf.Input{
-			image_size, bounding_boxes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
 // Computes sigmoid of `x` element-wise.
 //
 // Specifically, `y = 1 / (1 + exp(-x))`.
@@ -13106,9 +13397,7 @@ func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr {
 	}
 }
 
-// Adds sparse `updates` to individual values or slices within a given
-//
-// variable according to `indices`.
+// Applies sparse addition to individual values or slices in a Variable.
 //
 // `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 //
@@ -13122,24 +13411,24 @@ func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr {
 // `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 //
 // ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
 // ```
 //
-// For example, say we want to update 4 scattered elements to a rank-1 tensor to
-// 8 elements. In Python, that update would look like this:
+// For example, say we want to add 4 scattered elements to a rank-1 tensor to
+// 8 elements. In Python, that addition would look like this:
 //
 // ```python
-//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
-//     indices = tf.constant([[4], [3], [1] ,[7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     update = tf.scatter_nd_add(ref, indices, updates)
-//     with tf.Session() as sess:
-//       print sess.run(update)
+// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+// indices = tf.constant([[4], [3], [1], [7]])
+// updates = tf.constant([9, 10, 11, 12])
+// add = tf.scatter_nd_add(ref, indices, updates)
+// with tf.Session() as sess:
+//   print sess.run(add)
 // ```
 //
 // The resulting update to ref would look like this:
 //
-//     [1, 12, 3, 14, 14, 6, 7, 20]
+//     [1, 13, 3, 14, 14, 6, 7, 20]
 //
 // See `tf.scatter_nd` for more details about how to make updates to
 // slices.
@@ -13263,63 +13552,6 @@ func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.O
 	return op.Output(0)
 }
 
-// Returns the element-wise sum of a list of tensors.
-//
-// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
-// wait for all of its inputs to be ready before beginning to sum. This can
-// save memory if inputs are ready at different times, since minimum temporary
-// storage is proportional to the output size rather than the inputs size.
-//
-// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
-//
-// Returns a `Tensor` of same shape and type as the elements of `inputs`.
-//
-// Arguments:
-//	inputs: A list of `Tensor` objects, each with same shape and type.
-//	shape: Shape of elements of `inputs`.
-func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"shape": shape}
-	opspec := tf.OpSpec{
-		Type: "AccumulateNV2",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Outputs deterministic pseudorandom random integers from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[minval, maxval)`.
-//
-// The outputs are a deterministic function of `shape`, `seed`, `minval`, and `maxval`.
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
-//	minval: Minimum value (inclusive, scalar).
-//	maxval: Maximum value (exclusive, scalar).
-//
-// Returns Random values with specified shape.
-func StatelessRandomUniformInt(scope *Scope, shape tf.Output, seed tf.Output, minval tf.Output, maxval tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "StatelessRandomUniformInt",
-		Input: []tf.Input{
-			shape, seed, minval, maxval,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
 type StatelessTruncatedNormalAttr func(optionalAttr)
 
@@ -13501,6 +13733,91 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 	return op.Output(0)
 }
 
+// UnicodeDecodeAttr is an optional argument to UnicodeDecode.
+type UnicodeDecodeAttr func(optionalAttr)
+
+// UnicodeDecodeErrors sets the optional errors attribute to value.
+//
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeDecodeErrors(value string) UnicodeDecodeAttr {
+	return func(m optionalAttr) {
+		m["errors"] = value
+	}
+}
+
+// UnicodeDecodeReplacementChar sets the optional replacement_char attribute to value.
+//
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD or U+65533.)
+// If not specified, defaults to 65533
+func UnicodeDecodeReplacementChar(value int64) UnicodeDecodeAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
+	}
+}
+
+// UnicodeDecodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
+//
+// value: Whether to replace the C0 control characters (00-1F) with the
+// `replacement_char`. Default is false.
+// If not specified, defaults to false
+func UnicodeDecodeReplaceControlCharacters(value bool) UnicodeDecodeAttr {
+	return func(m optionalAttr) {
+		m["replace_control_characters"] = value
+	}
+}
+
+// Decodes each string in `input` into a sequence of Unicode code points.
+//
+// The character codepoints for all strings are returned using a single vector
+// `char_values`, with strings expanded to characters in row-major order.
+//
+// The `row_splits` tensor indicates where the codepoints for
+// each input string begin and end within the `char_values` tensor.
+// In particular, the values for the `i`th
+// string (in row-major order) are stored in the slice
+// `[row_splits[i]:row_splits[i+1]]`. Thus:
+//
+// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
+//   character in the `i`th string (in row-major order).
+// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
+//   string (in row-major order).
+//
+// Arguments:
+//	input: The text to be decoded. Can have any shape. Note that the output is flattened
+// to a vector of char values.
+//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
+// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+//
+// Returns A 1D int32 tensor containing the row splits.A 1D int32 Tensor containing the decoded codepoints.
+func UnicodeDecode(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeAttr) (row_splits tf.Output, char_values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"input_encoding": input_encoding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UnicodeDecode",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
 // Adds up a SparseTensor and a dense Tensor, using these special rules:
 //
 // (1) Broadcasts the dense side to have the same shape as the sparse side, if
@@ -13549,6 +13866,84 @@ func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// UnicodeEncodeAttr is an optional argument to UnicodeEncode.
+type UnicodeEncodeAttr func(optionalAttr)
+
+// UnicodeEncodeErrors sets the optional errors attribute to value.
+//
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeEncodeErrors(value string) UnicodeEncodeAttr {
+	return func(m optionalAttr) {
+		m["errors"] = value
+	}
+}
+
+// UnicodeEncodeReplacementChar sets the optional replacement_char attribute to value.
+//
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD (U+65533).
+// If not specified, defaults to 65533
+func UnicodeEncodeReplacementChar(value int64) UnicodeEncodeAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
+	}
+}
+
+// Encode a tensor of ints into unicode strings.
+//
+// Returns a vector of strings, where `output[i]` is constructed by encoding the
+// Unicode codepoints in `input_values[input_splits[i]:input_splits[i+1]]`
+// using `output_encoding`.
+//
+// ---
+//
+// Example:
+//
+// ```
+// input_values = [72, 101, 108, 108, 111, 87, 111, 114, 108, 100]
+// input_splits = [0, 5, 10]
+// output_encoding = 'UTF-8'
+//
+// output = ['Hello', 'World']
+// ```
+//
+// Arguments:
+//	input_values: A 1D tensor containing the unicode codepoints that should be encoded.
+//	input_splits: A 1D tensor specifying how the unicode codepoints should be split into strings.
+// In particular, `output[i]` is constructed by encoding the codepoints in the
+// slice `input_values[input_splits[i]:input_splits[i+1]]`.
+//	output_encoding: Unicode encoding of the output strings. Valid encodings are: `"UTF-8",
+// "UTF-16-BE", and "UTF-32-BE"`.
+//
+// Returns The 1-D Tensor of strings encoded from the provided unicode codepoints.
+func UnicodeEncode(scope *Scope, input_values tf.Output, input_splits tf.Output, output_encoding string, optional ...UnicodeEncodeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_encoding": output_encoding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UnicodeEncode",
+		Input: []tf.Input{
+			input_values, input_splits,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns the number of tensors in the input tensor list.
 //
 // input_handle: the input list
@@ -14871,6 +15266,117 @@ func MultiDeviceIteratorToStringHandle(scope *Scope, multi_device_iterator tf.Ou
 	return op.Output(0)
 }
 
+// CudnnRNNV3Attr is an optional argument to CudnnRNNV3.
+type CudnnRNNV3Attr func(optionalAttr)
+
+// CudnnRNNV3RnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNV3RnnMode(value string) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNV3InputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNV3InputMode(value string) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNV3Direction sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNV3Direction(value string) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNV3Dropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV3Dropout(value float32) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNV3Seed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV3Seed(value int64) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNV3Seed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV3Seed2(value int64) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// CudnnRNNV3IsTraining sets the optional is_training attribute to value.
+// If not specified, defaults to true
+func CudnnRNNV3IsTraining(value bool) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// A RNN backed by cuDNN.
+//
+// Computes the RNN from the input and initial states, with respect to the params
+// buffer. Accepts one extra input "sequence_lengths" than CudnnRNN.
+//
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicates whether there is a linear projection between the input and
+//   the actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// sequence_lengths: a vector of lengths of each input sequence.
+// output: A 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: The same shape has input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// is_training: Indicates whether this operation is used for inferenece or
+//   training.
+// reserve_space: An opaque tensor that can be used in backprop calculation. It
+//   is only produced if is_training is true.
+func CudnnRNNV3(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, sequence_lengths tf.Output, optional ...CudnnRNNV3Attr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output, host_reserved tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNV3",
+		Input: []tf.Input{
+			input, input_h, input_c, params, sequence_lengths,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
 // Applies softmax to a batched N-D `SparseTensor`.
 //
 // The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
@@ -15247,6 +15753,217 @@ func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
+// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
+type ResourceSparseApplyRMSPropAttr func(optionalAttr)
+
+// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the RMSProp algorithm.
+//
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyRMSProp",
+		Input: []tf.Input{
+			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
+type SampleDistortedBoundingBoxAttr func(optionalAttr)
+
+// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
+//
+// value: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
+// If not specified, defaults to 0.1
+func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["min_object_covered"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
+//
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["aspect_ratio_range"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+//
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["area_range"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
+//
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+//
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["use_image_if_no_bounding_boxes"] = value
+	}
+}
+
+// Generate a single randomly distorted bounding box for an image.
+//
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
+//
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
+//
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example,
+//
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
+//
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
+//
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
+// ```
+//
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
+//
+// Arguments:
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
+//
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SampleDistortedBoundingBox",
+		Input: []tf.Input{
+			image_size, bounding_boxes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
 // LRNAttr is an optional argument to LRN.
 type LRNAttr func(optionalAttr)
 
@@ -16889,6 +17606,23 @@ func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Outp
 	return op.Output(0)
 }
 
+// Records the latency of producing `input_dataset` elements in a StatsAggregator.
+func ExperimentalLatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalLatencyStatsDataset",
+		Input: []tf.Input{
+			input_dataset, tag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
 type SparseTensorDenseMatMulAttr func(optionalAttr)
 
@@ -17007,6 +17741,90 @@ func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Out
 	return scope.AddOperation(opspec)
 }
 
+// Store the input tensor in the state of the current session.
+//
+// Arguments:
+//	value: The tensor to be stored.
+//
+// Returns The handle for the tensor stored in the session state, represented
+// as a ResourceHandle object.
+func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GetSessionHandleV2",
+		Input: []tf.Input{
+			value,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
+type ResourceApplyAdamAttr func(optionalAttr)
+
+// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
+//
+// value: If `True`, uses the nesterov update.
+// If not specified, defaults to false
+func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update '*var' according to the Adam algorithm.
+//
+// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+// $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	beta2_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyAdam",
+		Input: []tf.Input{
+			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // SerializeManySparseAttr is an optional argument to SerializeManySparse.
 type SerializeManySparseAttr func(optionalAttr)
 
@@ -17487,6 +18305,69 @@ func Timestamp(scope *Scope) (ts tf.Output) {
 	return op.Output(0)
 }
 
+// ResourceSparseApplyKerasMomentumAttr is an optional argument to ResourceSparseApplyKerasMomentum.
+type ResourceSparseApplyKerasMomentumAttr func(optionalAttr)
+
+// ResourceSparseApplyKerasMomentumUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyKerasMomentumUseLocking(value bool) ResourceSparseApplyKerasMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceSparseApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value.
+//
+// value: If `True`, the tensor passed to compute grad will be
+// var + momentum * accum, so in the end, the var you get is actually
+// var + momentum * accum.
+// If not specified, defaults to false
+func ResourceSparseApplyKerasMomentumUseNesterov(value bool) ResourceSparseApplyKerasMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
+//
+// Set use_nesterov = True if you want to use Nesterov momentum.
+//
+// That is for rows we have grad for, we update var and accum as follows:
+//
+// accum = accum * momentum - lr * grad
+// var += accum
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	momentum: Momentum. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyKerasMomentumAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyKerasMomentum",
+		Input: []tf.Input{
+			var_, accum, lr, grad, indices, momentum,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // VariableShapeAttr is an optional argument to VariableShape.
 type VariableShapeAttr func(optionalAttr)
 
@@ -18225,45 +19106,94 @@ func DivNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the gradient for the sqrt of `x` wrt its input.
+// UnicodeDecodeWithOffsetsAttr is an optional argument to UnicodeDecodeWithOffsets.
+type UnicodeDecodeWithOffsetsAttr func(optionalAttr)
+
+// UnicodeDecodeWithOffsetsErrors sets the optional errors attribute to value.
 //
-// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeDecodeWithOffsetsErrors(value string) UnicodeDecodeWithOffsetsAttr {
+	return func(m optionalAttr) {
+		m["errors"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "SqrtGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
+}
+
+// UnicodeDecodeWithOffsetsReplacementChar sets the optional replacement_char attribute to value.
+//
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD or U+65533.)
+// If not specified, defaults to 65533
+func UnicodeDecodeWithOffsetsReplacementChar(value int64) UnicodeDecodeWithOffsetsAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Get the value of the tensor specified by its handle.
+// UnicodeDecodeWithOffsetsReplaceControlCharacters sets the optional replace_control_characters attribute to value.
+//
+// value: Whether to replace the C0 control characters (00-1F) with the
+// `replacement_char`. Default is false.
+// If not specified, defaults to false
+func UnicodeDecodeWithOffsetsReplaceControlCharacters(value bool) UnicodeDecodeWithOffsetsAttr {
+	return func(m optionalAttr) {
+		m["replace_control_characters"] = value
+	}
+}
+
+// Decodes each string in `input` into a sequence of Unicode code points.
+//
+// The character codepoints for all strings are returned using a single vector
+// `char_values`, with strings expanded to characters in row-major order.
+// Similarly, the character start byte offsets are returned using a single vector
+// `char_to_byte_starts`, with strings expanded in row-major order.
+//
+// The `row_splits` tensor indicates where the codepoints and start offsets for
+// each input string begin and end within the `char_values` and
+// `char_to_byte_starts` tensors.  In particular, the values for the `i`th
+// string (in row-major order) are stored in the slice
+// `[row_splits[i]:row_splits[i+1]]`. Thus:
+//
+// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
+//   character in the `i`th string (in row-major order).
+// * `char_to_bytes_starts[row_splits[i]+j]` is the start byte offset for the `j`th
+//   character in the `i`th string (in row-major order).
+// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
+//   string (in row-major order).
 //
 // Arguments:
-//	handle: The handle for a tensor stored in the session state.
-//	dtype: The type of the output value.
+//	input: The text to be decoded. Can have any shape. Note that the output is flattened
+// to a vector of char values.
+//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
+// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
 //
-// Returns The tensor for the given handle.
-func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
+// Returns A 1D int32 tensor containing the row splits.A 1D int32 Tensor containing the decoded codepoints.A 1D int32 Tensor containing the byte index in the input string where each
+// character in `char_values` starts.
+func UnicodeDecodeWithOffsets(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeWithOffsetsAttr) (row_splits tf.Output, char_values tf.Output, char_to_byte_starts tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"input_encoding": input_encoding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "GetSessionTensor",
+		Type: "UnicodeDecodeWithOffsets",
 		Input: []tf.Input{
-			handle,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
 // Returns x - y element-wise.
@@ -19787,6 +20717,176 @@ func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_
 	return op.Output(0)
 }
 
+// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
+type QuantizedConv2DAttr func(optionalAttr)
+
+// QuantizedConv2DOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// QuantizedConv2DDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 2D convolution given quantized 4D input and filter tensors.
+//
+// The inputs are quantized tensors where the lowest value represents the real
+// number of the associated minimum, and the highest represents the maximum.
+// This means that you can only interpret the quantized output in the same way, by
+// taking the returned minimum and maximum values into account.
+//
+// Arguments:
+//
+//	filter: filter's input_depth dimension must match input's depth dimensions.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_filter: The float value that the lowest quantized filter value represents.
+//	max_filter: The float value that the highest quantized filter value represents.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedConv2D",
+		Input: []tf.Input{
+			input, filter, min_input, max_input, min_filter, max_filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// ResourceGatherAttr is an optional argument to ResourceGather.
+type ResourceGatherAttr func(optionalAttr)
+
+// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Gather slices from the variable pointed to by `resource` according to `indices`.
+//
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
+//
+// ```python
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
+//
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
+//
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+// ```
+func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceGather",
+		Input: []tf.Input{
+			resource, indices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Delete the TensorArray from its resource container.
+//
+// This enables the user to close and release the resource in the middle
+// of a step/run.
+//
+// Arguments:
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+//
+// Returns the created operation.
+func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayCloseV3",
+		Input: []tf.Input{
+			handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// StatelessMultinomialAttr is an optional argument to StatelessMultinomial.
+type StatelessMultinomialAttr func(optionalAttr)
+
+// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
+	return func(m optionalAttr) {
+		m["output_dtype"] = value
+	}
+}
+
+// Draws samples from a multinomial distribution.
+//
+// Arguments:
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//	seed: 2 seeds (shape [2]).
+//
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatelessMultinomial",
+		Input: []tf.Input{
+			logits, num_samples, seed,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
 //
 // This Op does not require `a_indices` be sorted in standard lexicographic order.
@@ -20332,6 +21432,63 @@ func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional .
 	return op.Output(0)
 }
 
+// Returns the element-wise sum of a list of tensors.
+//
+// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
+// wait for all of its inputs to be ready before beginning to sum. This can
+// save memory if inputs are ready at different times, since minimum temporary
+// storage is proportional to the output size rather than the inputs size.
+//
+// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
+//
+// Returns a `Tensor` of same shape and type as the elements of `inputs`.
+//
+// Arguments:
+//	inputs: A list of `Tensor` objects, each with same shape and type.
+//	shape: Shape of elements of `inputs`.
+func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"shape": shape}
+	opspec := tf.OpSpec{
+		Type: "AccumulateNV2",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Outputs deterministic pseudorandom random integers from a uniform distribution.
+//
+// The generated values follow a uniform distribution in the range `[minval, maxval)`.
+//
+// The outputs are a deterministic function of `shape`, `seed`, `minval`, and `maxval`.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
+//	minval: Minimum value (inclusive, scalar).
+//	maxval: Maximum value (exclusive, scalar).
+//
+// Returns Random values with specified shape.
+func StatelessRandomUniformInt(scope *Scope, shape tf.Output, seed tf.Output, minval tf.Output, maxval tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "StatelessRandomUniformInt",
+		Input: []tf.Input{
+			shape, seed, minval, maxval,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
 //
 // The Hurwitz zeta function is defined as:
@@ -20352,15 +21509,62 @@ func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
+// Returns the cardinality of `input_dataset`.
+//
+// Returns the cardinality of `input_dataset`.
+//
+// Arguments:
+//	input_dataset: A variant tensor representing the dataset to return cardinality for.
+//
+// Returns The cardinality of `input_dataset`. Named constants are used to represent
+// infinite and unknown cardinality.
+func ExperimentalDatasetCardinality(scope *Scope, input_dataset tf.Output) (cardinality tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalDatasetCardinality",
+		Input: []tf.Input{
+			input_dataset,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that executes a SQL query and emits rows of the result set.
+//
+// Arguments:
+//	driver_name: The database type. Currently, the only supported type is 'sqlite'.
+//	data_source_name: A connection string to connect to the database.
+//	query: A SQL query to execute.
+//
+//
+func ExperimentalSqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalSqlDataset",
+		Input: []tf.Input{
+			driver_name, data_source_name, query,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Inverse fast Fourier transform.
 //
 // Computes the inverse 1-dimensional discrete Fourier transform over the
 // inner-most dimension of `input`.
 //
 // Arguments:
-//	input: A complex64 tensor.
+//	input: A complex tensor.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
+// Returns A complex tensor of the same shape as `input`. The inner-most
 //   dimension of `input` is replaced with its inverse 1D Fourier transform.
 //
 // @compatibility(numpy)
@@ -20386,9 +21590,9 @@ func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
 // 2 dimensions of `input`.
 //
 // Arguments:
-//	input: A complex64 tensor.
+//	input: A complex tensor.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
+// Returns A complex tensor of the same shape as `input`. The inner-most 2
 //   dimensions of `input` are replaced with their 2D Fourier transform.
 //
 // @compatibility(numpy)
@@ -20414,9 +21618,9 @@ func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 // inner-most 2 dimensions of `input`.
 //
 // Arguments:
-//	input: A complex64 tensor.
+//	input: A complex tensor.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
+// Returns A complex tensor of the same shape as `input`. The inner-most 2
 //   dimensions of `input` are replaced with their inverse 2D Fourier transform.
 //
 // @compatibility(numpy)
@@ -20756,6 +21960,44 @@ func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_
 	return op.Output(0)
 }
 
+// TensorListConcatAttr is an optional argument to TensorListConcat.
+type TensorListConcatAttr func(optionalAttr)
+
+// TensorListConcatElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorListConcatElementShape(value tf.Shape) TensorListConcatAttr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// Concats all tensors in the list along the 0th dimension.
+//
+// Requires that all tensors have the same shape except the first dimension.
+//
+// input_handle: The input list.
+// tensor: The concated result.
+// lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient.
+//
+func TensorListConcat(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListConcatAttr) (tensor tf.Output, lengths tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListConcat",
+		Input: []tf.Input{
+			input_handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
 // Returns the set of files matching one or more glob patterns.
 //
 // Note that this routine only supports wildcard characters in the
@@ -21129,176 +22371,6 @@ func BatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output,
 	return op.Output(0)
 }
 
-// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
-type QuantizedConv2DAttr func(optionalAttr)
-
-// QuantizedConv2DOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// QuantizedConv2DDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes a 2D convolution given quantized 4D input and filter tensors.
-//
-// The inputs are quantized tensors where the lowest value represents the real
-// number of the associated minimum, and the highest represents the maximum.
-// This means that you can only interpret the quantized output in the same way, by
-// taking the returned minimum and maximum values into account.
-//
-// Arguments:
-//
-//	filter: filter's input_depth dimension must match input's depth dimensions.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_filter: The float value that the lowest quantized filter value represents.
-//	max_filter: The float value that the highest quantized filter value represents.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedConv2D",
-		Input: []tf.Input{
-			input, filter, min_input, max_input, min_filter, max_filter,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// StatelessMultinomialAttr is an optional argument to StatelessMultinomial.
-type StatelessMultinomialAttr func(optionalAttr)
-
-// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
-	return func(m optionalAttr) {
-		m["output_dtype"] = value
-	}
-}
-
-// Draws samples from a multinomial distribution.
-//
-// Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
-//	seed: 2 seeds (shape [2]).
-//
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StatelessMultinomial",
-		Input: []tf.Input{
-			logits, num_samples, seed,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceGatherAttr is an optional argument to ResourceGather.
-type ResourceGatherAttr func(optionalAttr)
-
-// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Gather slices from the variable pointed to by `resource` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
-//
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
-//
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
-//
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-// ```
-func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceGather",
-		Input: []tf.Input{
-			resource, indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Delete the TensorArray from its resource container.
-//
-// This enables the user to close and release the resource in the middle
-// of a step/run.
-//
-// Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
-//
-// Returns the created operation.
-func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV3",
-		Input: []tf.Input{
-			handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Saves the input tensors to disk.
 //
 // The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
@@ -21967,6 +23039,61 @@ func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output,
 	return op.Output(0), op.Output(1)
 }
 
+// ResourceApplyAdamWithAmsgradAttr is an optional argument to ResourceApplyAdamWithAmsgrad.
+type ResourceApplyAdamWithAmsgradAttr func(optionalAttr)
+
+// ResourceApplyAdamWithAmsgradUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdamWithAmsgradUseLocking(value bool) ResourceApplyAdamWithAmsgradAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the Adam algorithm.
+//
+// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+// $$vhat_t := max{vhat_{t-1}, v_t}$$
+// $$variable := variable - lr_t * m_t / (\sqrt{vhat_t} + \epsilon)$$
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	vhat: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	beta2_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAdamWithAmsgrad(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, vhat tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamWithAmsgradAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyAdamWithAmsgrad",
+		Input: []tf.Input{
+			var_, m, v, vhat, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
 type MapUnstageNoKeyAttr func(optionalAttr)
 
@@ -22368,6 +23495,93 @@ func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shap
 	return op.Output(0)
 }
 
+// Adds sparse `updates` to an existing tensor according to `indices`.
+//
+// This operation creates a new tensor by adding sparse `updates` to the passed
+// in `tensor`.
+// This operation is very similar to `tf.scatter_nd_add`, except that the updates
+// are added onto an existing tensor (as opposed to a variable). If the memory
+// for the existing tensor cannot be re-used, a copy is made and updated.
+//
+// `indices` is an integer tensor containing indices into a new tensor of shape
+// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+//
+//     indices.shape[-1] <= shape.rank
+//
+// The last dimension of `indices` corresponds to indices into elements
+// (if `indices.shape[-1] = shape.rank`) or slices
+// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+// `shape`.  `updates` is a tensor with shape
+//
+//     indices.shape[:-1] + shape[indices.shape[-1]:]
+//
+// The simplest form of tensor_scatter_add is to add individual elements to a
+// tensor by index. For example, say we want to add 4 elements in a rank-1
+// tensor with 8 elements.
+//
+// In Python, this scatter add operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     tensor = tf.ones([8], dtype=tf.int32)
+//     updated = tf.tensor_scatter_add(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [1, 12, 1, 11, 10, 1, 1, 13]
+//
+// We can also, insert entire slices of a higher rank tensor all at once. For
+// example, if we wanted to insert two slices in the first dimension of a
+// rank-3 tensor with two matrices of new values.
+//
+// In Python, this scatter add operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[0], [2]])
+//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]],
+//                            [[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
+//     tensor = tf.ones([4, 4, 4])
+//     updated = tf.tensor_scatter_add(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [[[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
+//      [[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, the index is ignored.
+//
+// Arguments:
+//	tensor: Tensor to copy/update.
+//	indices: Index tensor.
+//	updates: Updates to scatter into output.
+//
+// Returns A new tensor copied from tensor and updates added according to the indices.
+func TensorScatterAdd(scope *Scope, tensor tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorScatterAdd",
+		Input: []tf.Input{
+			tensor, indices, updates,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the sign and the log of the absolute value of the determinant of
 //
 // one or more square matrices.
@@ -22592,6 +23806,84 @@ func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
+// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+//
+// Arguments:
+//
+//	num_threads: Identifies the number of threads to use for the private threadpool.
+//
+//
+func ExperimentalPrivateThreadPoolDataset(scope *Scope, input_dataset tf.Output, num_threads tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalPrivateThreadPoolDataset",
+		Input: []tf.Input{
+			input_dataset, num_threads,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ExperimentalParseExampleDatasetAttr is an optional argument to ExperimentalParseExampleDataset.
+type ExperimentalParseExampleDatasetAttr func(optionalAttr)
+
+// ExperimentalParseExampleDatasetSloppy sets the optional sloppy attribute to value.
+// If not specified, defaults to false
+func ExperimentalParseExampleDatasetSloppy(value bool) ExperimentalParseExampleDatasetAttr {
+	return func(m optionalAttr) {
+		m["sloppy"] = value
+	}
+}
+
+// Transforms `input_dataset` containing `Example` protos as vectors of DT_STRING into a dataset of `Tensor` or `SparseTensor` objects representing the parsed features.
+//
+// Arguments:
+//
+//
+//	dense_defaults: A dict mapping string keys to `Tensor`s.
+// The keys of the dict must match the dense_keys of the feature.
+//	sparse_keys: A list of string keys in the examples features.
+// The results for these keys will be returned as `SparseTensor` objects.
+//	dense_keys: A list of Ndense string Tensors (scalars).
+// The keys expected in the Examples features associated with dense values.
+//	sparse_types: A list of `DTypes` of the same length as `sparse_keys`.
+// Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
+// and `tf.string` (`BytesList`) are supported.
+//	dense_shapes: List of tuples with the same length as `dense_keys`.
+// The shape of the data for each dense feature referenced by `dense_keys`.
+// Required for any input tensors identified by `dense_keys`.  Must be
+// either fully defined, or may contain an unknown first dimension.
+// An unknown first dimension means the feature is treated as having
+// a variable number of blocks, and the output shape along this dimension
+// is considered unknown at graph build time.  Padding is applied for
+// minibatch elements smaller than the maximum number of blocks for the
+// given feature along this dimension.
+//	output_types: The type list for the return values.
+//	output_shapes: The list of shapes being produced.
+func ExperimentalParseExampleDataset(scope *Scope, input_dataset tf.Output, num_parallel_calls tf.Output, dense_defaults []tf.Output, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ExperimentalParseExampleDatasetAttr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes, "output_types": output_types, "output_shapes": output_shapes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalParseExampleDataset",
+		Input: []tf.Input{
+			input_dataset, num_parallel_calls, tf.OutputList(dense_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
 type SdcaOptimizerAttr func(optionalAttr)
 
@@ -23336,6 +24628,26 @@ func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (pro
 	return op.Output(0)
 }
 
+// Serializes the tree handle to a proto
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource to be serialized.
+//
+// Returns Serialied proto string of the tree resource.
+func TensorForestTreeSerialize(scope *Scope, tree_handle tf.Output) (tree_config tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreeSerialize",
+		Input: []tf.Input{
+			tree_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // SparseMatMulAttr is an optional argument to SparseMatMul.
 type SparseMatMulAttr func(optionalAttr)
 
@@ -26491,6 +27803,28 @@ func Invert(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Deserialize bucket boundaries and ready flag into current QuantileAccumulator.
+//
+// An op that deserializes bucket boundaries and are boundaries ready flag into current QuantileAccumulator.
+//
+// Arguments:
+//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
+//	bucket_boundaries: float; List of Rank 1 Tensors each containing the bucket boundaries for a feature.
+//
+// Returns the created operation.
+func BoostedTreesQuantileStreamResourceDeserialize(scope *Scope, quantile_stream_resource_handle tf.Output, bucket_boundaries []tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesQuantileStreamResourceDeserialize",
+		Input: []tf.Input{
+			quantile_stream_resource_handle, tf.OutputList(bucket_boundaries),
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Inverse 3D fast Fourier transform.
 //
 // Computes the inverse 3-dimensional discrete Fourier transform over the
@@ -27018,6 +28352,29 @@ func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate t
 	return op.Output(0)
 }
 
+// Splits a tensor into a list.
+//
+// list[i] corresponds to lengths[i] tensors from the input tensor.
+// The tensor must have rank at least 1 and contain exactly sum(lengths) elements.
+//
+// tensor: The input tensor.
+// element_shape: A shape compatible with that of elements in the tensor.
+// lengths: Vector of sizes of the 0th dimension of tensors in the list.
+// output_handle: The list.
+func TensorListSplit(scope *Scope, tensor tf.Output, element_shape tf.Output, lengths tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListSplit",
+		Input: []tf.Input{
+			tensor, element_shape, lengths,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // AvgPoolAttr is an optional argument to AvgPool.
 type AvgPoolAttr func(optionalAttr)
 
@@ -27138,6 +28495,26 @@ func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, el
 	return op.Output(0)
 }
 
+// Resizes the list.
+//
+//
+// input_handle: the input list
+// size: size of the output list
+//
+func TensorListResize(scope *Scope, input_handle tf.Output, size tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListResize",
+		Input: []tf.Input{
+			input_handle, size,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns a diagonal tensor with a given diagonal values.
 //
 // Given a `diagonal`, this operation returns a tensor with the `diagonal` and
@@ -27328,7 +28705,7 @@ func TensorListScatter(scope *Scope, tensor tf.Output, indices tf.Output, elemen
 //	limits: The limits of each range.
 //	deltas: The deltas of each range.
 //
-// Returns The `row_splits` for the returned `RaggedTensor`.The `inner_values` for the returned `RaggedTensor`.
+// Returns The `row_splits` for the returned `RaggedTensor`.The `flat_values` for the returned `RaggedTensor`.
 func RaggedRange(scope *Scope, starts tf.Output, limits tf.Output, deltas tf.Output) (rt_nested_splits tf.Output, rt_dense_values tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -27660,6 +29037,66 @@ func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output t
 	return op.Output(0)
 }
 
+// ResourceApplyKerasMomentumAttr is an optional argument to ResourceApplyKerasMomentum.
+type ResourceApplyKerasMomentumAttr func(optionalAttr)
+
+// ResourceApplyKerasMomentumUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyKerasMomentumUseLocking(value bool) ResourceApplyKerasMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value.
+//
+// value: If `True`, the tensor passed to compute grad will be
+// var + momentum * accum, so in the end, the var you get is actually
+// var + momentum * accum.
+// If not specified, defaults to false
+func ResourceApplyKerasMomentumUseNesterov(value bool) ResourceApplyKerasMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
+//
+// want to use Nesterov momentum.
+//
+// accum = accum * momentum - lr * grad
+// var += accum
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
+//	momentum: Momentum. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyKerasMomentumAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyKerasMomentum",
+		Input: []tf.Input{
+			var_, accum, lr, grad, momentum,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // MatrixSolveAttr is an optional argument to MatrixSolve.
 type MatrixSolveAttr func(optionalAttr)
 
@@ -27729,6 +29166,70 @@ func DatasetToGraph(scope *Scope, input_dataset tf.Output) (graph tf.Output) {
 	return op.Output(0)
 }
 
+// LuAttr is an optional argument to Lu.
+type LuAttr func(optionalAttr)
+
+// LuOutputIdxType sets the optional output_idx_type attribute to value.
+// If not specified, defaults to DT_INT32
+func LuOutputIdxType(value tf.DataType) LuAttr {
+	return func(m optionalAttr) {
+		m["output_idx_type"] = value
+	}
+}
+
+// Computes the LU decomposition of one or more square matrices.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices.
+//
+// The input has to be invertible.
+//
+// The output consists of two tensors LU and P containing the LU decomposition
+// of all input submatrices `[..., :, :]`. LU encodes the lower triangular and
+// upper triangular factors.
+//
+// For each input submatrix of shape `[M, M]`, L is a lower triangular matrix of
+// shape `[M, M]` with unit diagonal whose entries correspond to the strictly lower
+// triangular part of LU. U is a upper triangular matrix of shape `[M, M]` whose
+// entries correspond to the upper triangular part, including the diagonal, of LU.
+//
+// P represents a permutation matrix encoded as a list of indices each between `0`
+// and `M-1`, inclusive. If P_mat denotes the permutation matrix corresponding to
+// P, then the L, U and P satisfies P_mat * input = L * U.
+//
+// Arguments:
+//	input: A tensor of shape `[..., M, M]` whose inner-most 2 dimensions form matrices of
+// size `[M, M]`.
+//
+// Returns A tensor of shape `[..., M, M]` whose strictly lower triangular part denotes the
+// lower triangular factor `L` with unit diagonal, and whose upper triangular part
+// denotes the upper triangular factor `U`.Permutation of the rows encoded as a list of indices in `0..M-1`. Shape is
+// `[..., M]`.
+// @compatibility(scipy)
+// Similar to `scipy.linalg.lu`, except the triangular factors `L` and `U` are
+// packed into a single tensor, the permutation is applied to `input` instead of
+// the right hand side and the permutation `P` is returned as a list of indices
+// instead of a permutation matrix.
+// @end_compatibility
+func Lu(scope *Scope, input tf.Output, optional ...LuAttr) (lu tf.Output, p tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Lu",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
 // Computes the matrix square root of one or more square matrices:
 //
 // matmul(sqrtm(A), sqrtm(A)) = A
@@ -28987,90 +30488,6 @@ func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Outpu
 	return op.Output(0)
 }
 
-// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
-type ResourceApplyAdamAttr func(optionalAttr)
-
-// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, uses the nesterov update.
-// If not specified, defaults to false
-func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
-}
-
-// Update '*var' according to the Adam algorithm.
-//
-// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
-// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
-// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-// $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	beta2_power: Must be a scalar.
-//	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdam",
-		Input: []tf.Input{
-			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Store the input tensor in the state of the current session.
-//
-// Arguments:
-//	value: The tensor to be stored.
-//
-// Returns The handle for the tensor stored in the session state, represented
-// as a ResourceHandle object.
-func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "GetSessionHandleV2",
-		Input: []tf.Input{
-			value,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad.
 type ResizeBicubicGradAttr func(optionalAttr)
 
@@ -29948,6 +31365,43 @@ func Iterator(scope *Scope, shared_name string, container string, output_types [
 	return op.Output(0)
 }
 
+// TensorForestTreeResourceHandleOpAttr is an optional argument to TensorForestTreeResourceHandleOp.
+type TensorForestTreeResourceHandleOpAttr func(optionalAttr)
+
+// TensorForestTreeResourceHandleOpContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func TensorForestTreeResourceHandleOpContainer(value string) TensorForestTreeResourceHandleOpAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// TensorForestTreeResourceHandleOpSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func TensorForestTreeResourceHandleOpSharedName(value string) TensorForestTreeResourceHandleOpAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a handle to a TensorForestTreeResource
+func TensorForestTreeResourceHandleOp(scope *Scope, optional ...TensorForestTreeResourceHandleOpAttr) (resource tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreeResourceHandleOp",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
 type CropAndResizeGradImageAttr func(optionalAttr)
 
@@ -30312,6 +31766,29 @@ func FakeParam(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Outpu
 	return op.Output(0)
 }
 
+// Returns the next representable value of `x1` in the direction of `x2`, element-wise.
+//
+// This operation returns the same result as the C++ std::nextafter function.
+//
+// It can also return a subnormal number.
+//
+// @compatibility(cpp)
+// Equivalent to C++ std::nextafter function.
+// @end_compatibility
+func NextAfter(scope *Scope, x1 tf.Output, x2 tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "NextAfter",
+		Input: []tf.Input{
+			x1, x2,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the gradient for the inverse of `x` wrt its input.
 //
 // Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
@@ -30461,6 +31938,71 @@ func BoostedTreesQuantileStreamResourceAddSummaries(scope *Scope, quantile_strea
 	return scope.AddOperation(opspec)
 }
 
+// Creates a Dataset that returns pseudorandom numbers.
+//
+// Arguments:
+//	seed: A scalar seed for the random number generator. If either seed or
+// seed2 is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
+//
+//
+func ExperimentalRandomDataset(scope *Scope, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalRandomDataset",
+		Input: []tf.Input{
+			seed, seed2,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// A dataset that splits the elements of its input into multiple elements.
+func ExperimentalUnbatchDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalUnbatchDataset",
+		Input: []tf.Input{
+			input_dataset,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that overrides the maximum intra-op parallelism.
+//
+// Arguments:
+//
+//	max_intra_op_parallelism: Identifies the maximum intra-op parallelism to use.
+//
+//
+func ExperimentalMaxIntraOpParallelismDataset(scope *Scope, input_dataset tf.Output, max_intra_op_parallelism tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalMaxIntraOpParallelismDataset",
+		Input: []tf.Input{
+			input_dataset, max_intra_op_parallelism,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // StringSplitV2Attr is an optional argument to StringSplitV2.
 type StringSplitV2Attr func(optionalAttr)
 
@@ -30823,6 +32365,83 @@ func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.
 	return scope.AddOperation(opspec)
 }
 
+// ResourceScatterNdSubAttr is an optional argument to ResourceScatterNdSub.
+type ResourceScatterNdSubAttr func(optionalAttr)
+
+// ResourceScatterNdSubUseLocking sets the optional use_locking attribute to value.
+//
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdSubUseLocking(value bool) ResourceScatterNdSubAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Applies sparse subtraction to individual values or slices in a Variable.
+//
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// ```
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
+// ```
+//
+// For example, say we want to subtract 4 scattered elements from a rank-1 tensor
+// with 8 elements. In Python, that subtraction would look like this:
+//
+// ```python
+// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+// indices = tf.constant([[4], [3], [1], [7]])
+// updates = tf.constant([9, 10, 11, 12])
+// sub = tf.scatter_nd_sub(ref, indices, updates)
+// with tf.Session() as sess:
+//   print sess.run(sub)
+// ```
+//
+// The resulting update to ref would look like this:
+//
+//     [1, -9, 3, -6, -4, 6, 7, -4]
+//
+// See `tf.scatter_nd` for more details about how to make updates to
+// slices.
+//
+// Arguments:
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of
+// values to add to ref.
+//
+// Returns the created operation.
+func ResourceScatterNdSub(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdSubAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterNdSub",
+		Input: []tf.Input{
+			ref, indices, updates,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // TensorArrayConcatV2Attr is an optional argument to TensorArrayConcatV2.
 type TensorArrayConcatV2Attr func(optionalAttr)
 
@@ -31032,6 +32651,43 @@ func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Outp
 	return op.Output(0)
 }
 
+// ExperimentalStatsAggregatorHandleAttr is an optional argument to ExperimentalStatsAggregatorHandle.
+type ExperimentalStatsAggregatorHandleAttr func(optionalAttr)
+
+// ExperimentalStatsAggregatorHandleContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func ExperimentalStatsAggregatorHandleContainer(value string) ExperimentalStatsAggregatorHandleAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// ExperimentalStatsAggregatorHandleSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func ExperimentalStatsAggregatorHandleSharedName(value string) ExperimentalStatsAggregatorHandleAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a statistics manager resource.
+func ExperimentalStatsAggregatorHandle(scope *Scope, optional ...ExperimentalStatsAggregatorHandleAttr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalStatsAggregatorHandle",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // A container for an iterator resource.
 //
 // Returns A handle to the iterator that can be passed to a "MakeIterator" or
@@ -31157,6 +32813,21 @@ func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int
 	return op.Output(0)
 }
 
+// Produces a summary of any statistics recorded by the given statistics manager.
+func ExperimentalStatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalStatsAggregatorSummary",
+		Input: []tf.Input{
+			iterator,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Makes a new iterator from the given `dataset` and stores it in `iterator`.
 //
 // This operation may be executed multiple times. Each execution will reset the
@@ -31488,6 +33159,26 @@ func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQu
 	return op.Output(0)
 }
 
+// Deserializes a proto into the tree handle
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource to be restored.
+//	tree_config: Serialied proto string of the boosted_trees.Tree proto.
+//
+// Returns the created operation.
+func TensorForestTreeDeserialize(scope *Scope, tree_handle tf.Output, tree_config tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreeDeserialize",
+		Input: []tf.Input{
+			tree_handle, tree_config,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Constructs an Optional variant from a tuple of tensors.
 func OptionalFromValue(scope *Scope, components []tf.Output) (optional tf.Output) {
 	if scope.Err() != nil {
@@ -31705,9 +33396,9 @@ func IteratorGetNextAsOptional(scope *Scope, iterator tf.Output, output_types []
 // dimension of `input`.
 //
 // Arguments:
-//	input: A complex64 tensor.
+//	input: A complex tensor.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
+// Returns A complex tensor of the same shape as `input`. The inner-most
 //   dimension of `input` is replaced with its 1D Fourier transform.
 //
 // @compatibility(numpy)
@@ -31820,6 +33511,19 @@ func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
 	}
 }
 
+// Conv2DBackpropInputExplicitPaddings sets the optional explicit_paddings attribute to value.
+//
+// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+// dimension, the amount of padding inserted before and after the dimension is
+// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
+// If not specified, defaults to <>
+func Conv2DBackpropInputExplicitPaddings(value []int64) Conv2DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["explicit_paddings"] = value
+	}
+}
+
 // Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
 //
 // value: Specify the data format of the input and output data. With the
@@ -32418,6 +34122,28 @@ func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.O
 	return op.Output(0), op.Output(1)
 }
 
+// Writes the given dataset to the given file using the TFRecord format.
+//
+// Arguments:
+//	input_dataset: A variant tensor representing the dataset to write.
+//	filename: A scalar string tensor representing the filename to use.
+//	compression_type: A scalar string tensor containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//
+// Returns the created operation.
+func ExperimentalDatasetToTFRecord(scope *Scope, input_dataset tf.Output, filename tf.Output, compression_type tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalDatasetToTFRecord",
+		Input: []tf.Input{
+			input_dataset, filename, compression_type,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // QueueCloseV2Attr is an optional argument to QueueCloseV2.
 type QueueCloseV2Attr func(optionalAttr)
 
@@ -32754,6 +34480,23 @@ func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, o
 	return op.Output(0)
 }
 
+// Records the bytes size of each element of `input_dataset` in a StatsAggregator.
+func ExperimentalBytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalBytesProducedStatsDataset",
+		Input: []tf.Input{
+			input_dataset, tag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // StackPushV2Attr is an optional argument to StackPushV2.
 type StackPushV2Attr func(optionalAttr)
 
@@ -33588,6 +35331,19 @@ func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
 	}
 }
 
+// Conv2DExplicitPaddings sets the optional explicit_paddings attribute to value.
+//
+// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+// dimension, the amount of padding inserted before and after the dimension is
+// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
+// If not specified, defaults to <>
+func Conv2DExplicitPaddings(value []int64) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["explicit_paddings"] = value
+	}
+}
+
 // Conv2DDataFormat sets the optional data_format attribute to value.
 //
 // value: Specify the data format of the input and output data. With the
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index 10808e162ee4cc679430c0573e5bff8322ad6fff..af5503f2ad308fffb03d2ebd5964eec273896c72 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -295,6 +295,19 @@ tf_java_test(
     ],
 )
 
+tf_java_test(
+    name = "GeneratedOperationsTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/op/core/GeneratedOperationsTest.java"],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.op.core.GeneratedOperationsTest",
+    deps = [
+        ":tensorflow",
+        ":testutil",
+        "@junit",
+    ],
+)
+
 tf_java_test(
     name = "GradientsTest",
     size = "small",
diff --git a/tensorflow/java/src/gen/cc/op_gen_main.cc b/tensorflow/java/src/gen/cc/op_gen_main.cc
index 0d9e0883af262ee1f262a5e1308cb9df8763488d..cf4bb03dadec421411300100880f9129d7da47be 100644
--- a/tensorflow/java/src/gen/cc/op_gen_main.cc
+++ b/tensorflow/java/src/gen/cc/op_gen_main.cc
@@ -35,7 +35,7 @@ const char kUsageHeader[] =
     "graph.\n\n"
     "Operation wrappers are generated under the path specified by the "
     "'--output_dir' argument. This path can be absolute or relative to the\n"
-    "current working directory and will be created if it does not exists.\n\n"
+    "current working directory and will be created if it does not exist.\n\n"
     "Note that the operations will not be available through the "
     "'org.tensorflow.op.Ops' API until the generated classes are compiled\n"
     "using an appropriate annotation processor.\n\n"
diff --git a/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java b/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
index 1b7bcdab35f45142aefdc9e9635b398090e60b17..df1426ad75143d720f1d5bd3cf4ce44d30cb226e 100644
--- a/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
+++ b/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
@@ -340,7 +340,7 @@ public final class OperatorProcessor extends AbstractProcessor {
                     + "{@link $T @Operator} is exposed\n"
                     + "by this API or one of its subgroup.\n<p>Example usage:\n<pre>{@code\n"
                     + "try (Graph g = new Graph()) {\n"
-                    + "  Ops ops = new Ops(g);\n"
+                    + "  Ops ops = Ops.create(g);\n"
                     + "  // Operations are typed classes with convenience\n"
                     + "  // builders in Ops.\n"
                     + "  Constant three = ops.constant(3);\n"
diff --git a/tensorflow/java/src/main/java/org/tensorflow/op/annotation/Operator.java b/tensorflow/java/src/main/java/org/tensorflow/op/annotation/Operator.java
index 3782240edb4008cc71c55cf48cba8f5873b71018..38f466c57416eac96a09cd1dfe8558fcb8e3606f 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/op/annotation/Operator.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/op/annotation/Operator.java
@@ -25,11 +25,11 @@ import java.lang.annotation.Target;
  * Annotation used by classes to make TensorFlow operations conveniently accessible via {@code
  * org.tensorflow.op.Ops}.
  *
- * <p>An annotation processor (TODO: not yet implemented) builds the {@code Ops} class by
- * aggregating all classes annotated as {@code @Operator}s. Each annotated class <b>must</b> have at
- * least one public static factory method named {@code create} that accepts a {@link
- * org.tensorflow.op.Scope} as its first argument. The processor then adds a convenience method in
- * the {@code Ops} class. For example:
+ * <p>An annotation processor ({@code org.tensorflow.processor.OperatorProcessor}) builds the
+ * {@code Ops} class by aggregating all classes annotated as {@code @Operator}s. Each annotated
+ * class <b>must</b> have at least one public static factory method named {@code create} that
+ * accepts a {@link org.tensorflow.op.Scope} as its first argument. The processor then adds a
+ * convenience method in the {@code Ops} class. For example:
  *
  * <pre>{@code
  * @Operator
@@ -45,7 +45,7 @@ import java.lang.annotation.Target;
  * <pre>{@code
  * import org.tensorflow.op.Ops;
  * ...
- * Ops ops = new Ops(graph);
+ * Ops ops = Ops.create(graph);
  * ...
  * ops.myOp(operand);
  * // and has exactly the same effect as calling
diff --git a/tensorflow/java/src/test/java/org/tensorflow/op/core/GeneratedOperationsTest.java b/tensorflow/java/src/test/java/org/tensorflow/op/core/GeneratedOperationsTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..42d126c3c422884031c6b5e5ee9ef2ba3b0207f5
--- /dev/null
+++ b/tensorflow/java/src/test/java/org/tensorflow/op/core/GeneratedOperationsTest.java
@@ -0,0 +1,43 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.op.core;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.tensorflow.Graph;
+import org.tensorflow.Operand;
+import org.tensorflow.Session;
+import org.tensorflow.Tensor;
+import org.tensorflow.op.Ops;
+
+@RunWith(JUnit4.class)
+public final class GeneratedOperationsTest {
+
+  @Test
+  public void tensorInputTensorOutput() {
+    try (Graph g = new Graph();
+        Session sess = new Session(g)) {
+      Ops ops = Ops.create(g);
+      Operand<Integer> x = ops.math().add(ops.constant(1), ops.constant(2));
+      try (Tensor<Integer> result = sess.runner().fetch(x).run().get(0).expect(Integer.class)) {
+        assertEquals(3, result.intValue());
+      }
+    }
+  }
+}
diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl
index c17eddf47bc86c9537364117c302df38e390c8da..343ec60f2a6cb947387d6f9236530dcc1e035d7e 100644
--- a/tensorflow/lite/build_def.bzl
+++ b/tensorflow/lite/build_def.bzl
@@ -311,6 +311,7 @@ def generated_test_models():
         "topk",
         "transpose",
         "transpose_conv",
+        "unique",
         "unpack",
         "unroll_batch_matmul",
         "where",
diff --git a/tensorflow/lite/builtin_ops.h b/tensorflow/lite/builtin_ops.h
index f97d3ac4bf0b27cdd9b1f5ab7258a12036c29179..ce73aa0f9cddbe53021dc2e6fd515fc1606b9469 100644
--- a/tensorflow/lite/builtin_ops.h
+++ b/tensorflow/lite/builtin_ops.h
@@ -128,6 +128,7 @@ typedef enum {
   kTfLiteBuiltinMirrorPad = 100,
   kTfLiteBuiltinAbs = 101,
   kTfLiteBuiltinSplitV = 102,
+  kTfLiteBuiltinUnique = 103,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/lite/c/builtin_op_data.h b/tensorflow/lite/c/builtin_op_data.h
index 58e7221bc6e5a9d062127e30d0007838563db76e..332c2db14511af18a8e3d99fc93891ce92d1792a 100644
--- a/tensorflow/lite/c/builtin_op_data.h
+++ b/tensorflow/lite/c/builtin_op_data.h
@@ -25,6 +25,11 @@ extern "C" {
 
 // TODO(aselle): Consider using "if this then that" for testing.
 
+// Useful placeholder to put in otherwise empty structs to avoid size warnings.
+typedef struct {
+  char dummy;
+} EmptyStructPlaceholder;
+
 // IMPORTANT: All new members of structs must be added at the end to ensure
 // backwards compatibility.
 
@@ -152,9 +157,11 @@ typedef struct {
 } TfLiteAddParams;
 
 typedef struct {
+  EmptyStructPlaceholder placeholder;
 } TfLiteSpaceToBatchNDParams;
 
 typedef struct {
+  EmptyStructPlaceholder placeholder;
 } TfLiteBatchToSpaceNDParams;
 
 typedef struct {
@@ -230,9 +237,11 @@ typedef struct {
 } TfLiteResizeNearestNeighborParams;
 
 typedef struct {
+  EmptyStructPlaceholder placeholder;
 } TfLitePadParams;
 
 typedef struct {
+  EmptyStructPlaceholder placeholder;
 } TfLitePadV2Params;
 
 typedef struct {
@@ -272,6 +281,7 @@ typedef struct {
 } TfLiteGatherParams;
 
 typedef struct {
+  EmptyStructPlaceholder placeholder;
 } TfLiteTransposeParams;
 
 typedef struct {
@@ -351,6 +361,10 @@ typedef struct {
   float alpha;
 } TfLiteLeakyReluParams;
 
+typedef struct {
+  TfLiteType index_out_type;
+} TfLiteUniqueParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc
index e73c4ce023d7ecde7f8422cf3e2709f45b35b621..970e45bbdcd5c5d582c0cab29ea89c657987c70d 100644
--- a/tensorflow/lite/core/api/flatbuffer_conversions.cc
+++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include <cstdlib>
 
 #include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
 
@@ -651,6 +653,18 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_UNIQUE: {
+      TfLiteUniqueParams* params = allocator->AllocatePOD<TfLiteUniqueParams>();
+      auto* unique_params = op->builtin_options_as_UniqueOptions();
+      if (unique_params != nullptr) {
+        params->index_out_type =
+            unique_params->idx_out_type() == tflite::TensorType_INT64
+                ? TfLiteType::kTfLiteInt64
+                : TfLiteType::kTfLiteInt32;
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
 
     // Below are the ops with no builtin_data strcture.
     case BuiltinOperator_ABS:
diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc
index 75212cac22bd216b23457fc10e1cbd9d30a34319..4be80d143e7d34b45be45e06e42519afe4d32827 100644
--- a/tensorflow/lite/core/subgraph.cc
+++ b/tensorflow/lite/core/subgraph.cc
@@ -670,7 +670,7 @@ TfLiteStatus Subgraph::Invoke() {
       TfLiteTensor* tensor = &tensors_[tensor_index];
       if (tensor->delegate && tensor->delegate != node.delegate &&
           tensor->data_is_stale) {
-        EnsureTensorDataIsReadable(tensor_index);
+        TF_LITE_ENSURE_STATUS(EnsureTensorDataIsReadable(tensor_index));
       }
     }
 
@@ -683,8 +683,8 @@ TfLiteStatus Subgraph::Invoke() {
     EnsureTensorsVectorCapacity();
     tensor_resized_since_op_invoke_ = false;
     if (OpInvoke(registration, &node) == kTfLiteError) {
-      status = ReportOpError(context_, node, registration, node_index,
-                             "failed to invoke");
+      return ReportOpError(context_, node, registration, node_index,
+                           "failed to invoke");
     }
 
     // Force execution prep for downstream ops if the latest op triggered the
diff --git a/tensorflow/lite/delegates/flex/buffer_map.h b/tensorflow/lite/delegates/flex/buffer_map.h
index 6c1df4c8362e487e52885b06369b2e435e0a014f..d4724a011dafa153bd9391549ee1c65914def2ce 100644
--- a/tensorflow/lite/delegates/flex/buffer_map.h
+++ b/tensorflow/lite/delegates/flex/buffer_map.h
@@ -66,6 +66,9 @@ class BufferMap {
   // be use by TF's forwarding optimizations.
   void SetForwardable(int tensor_index) { forwardable_.insert(tensor_index); }
 
+  // Removes all information about which tensors are forwardable.
+  void ClearForwardable() { forwardable_.clear(); }
+
   // Returns true if this tensor has been explicitly marks as forwardable by
   // a call to SetForwardable().
   bool IsForwardable(int tensor_index) const {
diff --git a/tensorflow/lite/delegates/flex/buffer_map_test.cc b/tensorflow/lite/delegates/flex/buffer_map_test.cc
index 2148bfe8e229b2f7c59cd90f0632e68479159049..7b4acbd69d0fed3f1ef80b9af2ee779dae762a92 100644
--- a/tensorflow/lite/delegates/flex/buffer_map_test.cc
+++ b/tensorflow/lite/delegates/flex/buffer_map_test.cc
@@ -278,6 +278,8 @@ TEST(BufferMapTest, Forwardable) {
   EXPECT_FALSE(buffer_map.IsForwardable(0));
   buffer_map.SetForwardable(0);
   EXPECT_TRUE(buffer_map.IsForwardable(0));
+  buffer_map.ClearForwardable();
+  EXPECT_FALSE(buffer_map.IsForwardable(0));
 }
 
 }  // namespace
diff --git a/tensorflow/lite/delegates/flex/kernel.cc b/tensorflow/lite/delegates/flex/kernel.cc
index d6e12ef6503eb13d63d27420e6686319bf006ea7..2e0fc22ad6872884d04da4c2d2f8a4dce0246de9 100644
--- a/tensorflow/lite/delegates/flex/kernel.cc
+++ b/tensorflow/lite/delegates/flex/kernel.cc
@@ -51,108 +51,306 @@ namespace tflite {
 namespace flex {
 namespace kernel {
 
-// Controls the lifetime of tensor handles in a vector.
-class VectorOfHandles {
+struct OpNode;
+
+// Represents the origin of a given tensor as a reference to the output
+// of an upstream node.
+struct TensorSource {
+  OpNode* node;
+  int node_output_index;
+};
+
+// A list of inputs of a given node of the TensorFlow/Eager graph.
+class OpInputs {
  public:
-  explicit VectorOfHandles(int num_elements) : vector_(num_elements, nullptr) {}
+  explicit OpInputs(const TfLiteIntArray* indexes) {
+    for (int index : TfLiteIntArrayView(indexes)) {
+      inputs_.push_back(index);
+    }
+  }
+  ~OpInputs() {}
+
+  int Size() const { return inputs_.size(); }
+
+  int TfLiteIndex(int i) const { return inputs_[i]; }
+
+  // Given a map relating tensors to the node that originates them, populate a
+  // list of sources for the tensors in this class.
+  void InitializeTensorSources(
+      const std::map<int, TensorSource>& tflite_tensor_sources) {
+    sources_.clear();
+    for (int i : inputs_) {
+      auto it = tflite_tensor_sources.find(i);
+      if (it == tflite_tensor_sources.end()) {
+        sources_.push_back({nullptr, 0});
+      } else {
+        sources_.push_back(it->second);
+      }
+    }
+  }
+
+  TensorSource GetTensorSource(int i) const { return sources_[i]; }
 
-  ~VectorOfHandles() {
-    for (auto* handle : vector_) {
-      if (handle) handle->Unref();
+ private:
+  std::vector<int> inputs_;
+  std::vector<TensorSource> sources_;
+};
+
+// A list of outputs of a given node of the TensorFlow/Eager graph, along with
+// the actual outputs of the EagerOperation.
+class OpOutputs {
+ public:
+  explicit OpOutputs(const TfLiteIntArray* indexes) {
+    for (int index : TfLiteIntArrayView(indexes)) {
+      outputs_.push_back(index);
+    }
+    vector_.resize(outputs_.size());
+  }
+  ~OpOutputs() { ResetTensorHandles(); }
+
+  // Stores information about which of the tensors in this class are also
+  // outputs of the sugbraph.
+  void InitializeGraphOutputs(const std::set<int>& subgraph_outputs) {
+    subgraph_outputs_.clear();
+    for (int i : outputs_) {
+      subgraph_outputs_.push_back(subgraph_outputs.count(i) > 0);
     }
   }
 
-  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2>* GetVector() {
-    return &vector_;
+  // Returns true if the tensor given by index 'i' is an output of the entire
+  // subgraph.
+  bool IsSubgraphOutput(int i) const { return subgraph_outputs_[i]; }
+
+  // Returns a handle to a given tensor and, optionally, remove it from the
+  // internal vector.
+  tensorflow::TensorHandle* GetHandle(int i, bool remove) {
+    auto* handle = vector_[i];
+    if (!remove) {
+      handle->Ref();
+    } else {
+      // Don't increase the ref-count. Instead, simply take it out of the
+      // vector.
+      vector_[i] = nullptr;
+    }
+    return handle;
   }
 
-  tensorflow::TensorHandle* GetHandle(int index) { return vector_[index]; }
+  int Size() const { return outputs_.size(); }
+
+  int TfLiteIndex(int i) const { return outputs_[i]; }
+
+  // Carefully unreference all the handles in the eager output vector.
+  void ResetTensorHandles() {
+    for (int i = 0; i < vector_.size(); ++i) {
+      if (vector_[i]) {
+        vector_[i]->Unref();
+        vector_[i] = nullptr;
+      }
+    }
+  }
+
+  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2>*
+  GetTensorHandles() {
+    return &vector_;
+  }
 
  private:
+  std::vector<int> outputs_;
+  std::vector<bool> subgraph_outputs_;
   tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> vector_;
 };
 
-// Executes the TensorFlow op given by 'op_name', with the attributes specified
-// in 'nodedef'. Inputs and outputs are given as indices into the 'buffer_map'.
-tensorflow::Status ExecuteFlexOp(tensorflow::EagerContext* eager_context,
-                                 BufferMap* buffer_map, const string& op_name,
-                                 const tensorflow::NodeDef& nodedef,
-                                 const std::vector<int>& inputs,
-                                 const std::vector<int>& outputs) {
-  const tensorflow::AttrTypeMap* attr_types;
-  bool is_function = false;
-  TF_RETURN_WITH_CONTEXT_IF_ERROR(
-      tensorflow::AttrTypeMapForOp(op_name.c_str(), &attr_types, &is_function),
-      " (while processing attributes of '", op_name, "')");
-  if (is_function) {
-    return tensorflow::errors::NotFound(
-        "Operation '", op_name,
-        "' is not registered.  (while processing attributes of '", op_name,
-        "')");
-  }
-  tensorflow::EagerOperation op(eager_context, op_name.c_str(),
-                                /*is_function=*/false, attr_types);
-  for (const auto& attr : nodedef.attr()) {
-    op.MutableAttrs()->Set(attr.first, attr.second);
+// A single node within the larger 'op'. Note that this kernel executes many
+// TensorFlow ops within a single TF Lite op.
+class OpNode {
+ public:
+  OpNode(const TfLiteIntArray* inputs, const TfLiteIntArray* outputs)
+      : inputs_(inputs), outputs_(outputs) {}
+  ~OpNode() {
+    if (op_) ClearEagerInputs();
   }
 
-  for (int input_index : inputs) {
-    if (!buffer_map->HasTensor(input_index)) {
+  const string& name() const { return name_; }
+  void set_name(const string& name) { name_ = name; }
+
+  int index() const { return index_; }
+  void set_index(int index) { index_ = index; }
+
+  const tensorflow::NodeDef& nodedef() const { return nodedef_; }
+
+  const OpInputs& inputs() const { return inputs_; }
+  OpInputs* mutable_inputs() { return &inputs_; }
+
+  const OpOutputs& outputs() const { return outputs_; }
+  OpOutputs* mutable_outputs() { return &outputs_; }
+
+  int NumInputs() const { return inputs_.Size(); }
+  int NumOutputs() const { return outputs_.Size(); }
+
+  tensorflow::EagerOperation* op() { return op_.get(); }
+
+  tensorflow::Status InitializeNodeDef(const void* custom_initial_data,
+                                       int custom_initial_data_size) {
+    if (!custom_initial_data) {
       return tensorflow::errors::Internal(
-          "Cannot read from invalid tensor index ", input_index);
+          "Cannot convert empty data into a valid NodeDef");
     }
-    auto* handle = new tensorflow::TensorHandle(
-        buffer_map->GetTensor(input_index), nullptr, nullptr, nullptr);
-    op.AddInput(handle);
-    handle->Unref();
-
-    if (buffer_map->IsForwardable(input_index)) {
-      // Take it out of the map, so Eager/TF can reuse the buffer for an output
-      // tensor of the op.
-      buffer_map->RemoveTensor(input_index);
+    // The flexbuffer contains a vector where the first elements is the
+    // op name and the second is a serialized NodeDef.
+    const flexbuffers::Vector& v =
+        flexbuffers::GetRoot(
+            reinterpret_cast<const uint8_t*>(custom_initial_data),
+            custom_initial_data_size)
+            .AsVector();
+
+    name_ = v[0].AsString().str();
+    if (!nodedef_.ParseFromString(v[1].AsString().str())) {
+      nodedef_.Clear();
+      return tensorflow::errors::Internal(
+          "Failed to parse data into a valid NodeDef");
     }
+
+    // Fill NodeDef with defaults if it's a valid op.
+    const tensorflow::OpRegistrationData* op_reg_data;
+    TF_RETURN_IF_ERROR(
+        tensorflow::OpRegistry::Global()->LookUp(nodedef_.op(), &op_reg_data));
+    AddDefaultsToNodeDef(op_reg_data->op_def, &nodedef_);
+
+    return tensorflow::Status::OK();
   }
 
-  int num_retvals = outputs.size();
-  VectorOfHandles retvals(num_retvals);
-  TF_RETURN_WITH_CONTEXT_IF_ERROR(
-      EagerExecute(&op, retvals.GetVector(), &num_retvals),
-      " (while executing '", op_name, "' via Eager)");
+  // Build thew new EagerOperation. In case of error, the returned 'op' is
+  // guaranteed to be 'nullptr'.
+  tensorflow::Status BuildEagerOp(tensorflow::EagerContext* eager_context) {
+    op_.reset();
+
+    const tensorflow::AttrTypeMap* attr_types;
+    bool is_function = false;
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(
+        tensorflow::AttrTypeMapForOp(name_.c_str(), &attr_types, &is_function),
+        " (while processing attributes of '", name_, "')");
+    if (is_function) {
+      return tensorflow::errors::NotFound(
+          "Operation '", name_,
+          "' is not registered.  (while processing attributes of '", name_,
+          "')");
+    }
 
-  if (num_retvals != outputs.size()) {
-    return tensorflow::errors::Internal(
-        "Unexpected number of outputs from EagerExecute");
+    op_.reset(new tensorflow::EagerOperation(eager_context, name_.c_str(),
+                                             /*is_function=*/false,
+                                             attr_types));
+
+    op_->MutableAttrs()->NumInputs(inputs_.Size());
+    for (const auto& attr : nodedef_.attr()) {
+      op_->MutableAttrs()->Set(attr.first, attr.second);
+    }
+
+    // Precalculating a cache key saves about 10% of inference time for very
+    // small models.
+    tensorflow::Device* device = op_->Device();
+    op_->MutableAttrs()->CacheKey(device == nullptr ? "unspecified"
+                                                    : device->name());
+
+    return tensorflow::Status::OK();
+  }
+
+  void ClearEagerInputs() {
+    for (tensorflow::TensorHandle* h : *op_->MutableInputs()) {
+      if (h) h->Unref();
+    }
+    op_->MutableInputs()->clear();
   }
 
-  for (int i = 0; i < num_retvals; ++i) {
-    const tensorflow::Tensor* tensor = nullptr;
-    TF_RETURN_IF_ERROR(retvals.GetHandle(i)->Tensor(&tensor));
-    buffer_map->SetFromTensorFlow(outputs[i], *tensor);
+  tensorflow::Status BuildEagerInputs(const BufferMap* buffer_map) {
+    for (int i = 0; i < inputs_.Size(); ++i) {
+      int input_index = inputs_.TfLiteIndex(i);
+      TensorSource s = inputs_.GetTensorSource(i);
+      if (!s.node) {
+        // This input is not produced by this Eager subgraph (it could be a TF
+        // Lite native buffer, or could be produced by a separater subgraph). We
+        // need to fetch it from the delegate's buffer_map.
+        if (!buffer_map->HasTensor(input_index)) {
+          return tensorflow::errors::Internal(
+              "Cannot read from invalid tensor index ", input_index);
+        }
+        auto* handle = new tensorflow::TensorHandle(
+            buffer_map->GetTensor(input_index), nullptr, nullptr, nullptr);
+        op_->MutableInputs()->push_back(handle);
+      } else {
+        // If this is a forwardable tensor, we will remove it from the previous
+        // op's list, giving TF the opportunity to reuse its buffer.
+        bool unref_handle = buffer_map->IsForwardable(input_index);
+        auto* handle =
+            s.node->outputs_.GetHandle(s.node_output_index, unref_handle);
+        op_->MutableInputs()->push_back(handle);
+      }
+    }
+    return tensorflow::Status::OK();
   }
 
-  return tensorflow::Status::OK();
-}
+  tensorflow::Status PersistEagerOutputs(BufferMap* buffer_map) {
+    auto* handles = outputs_.GetTensorHandles();
+    for (int i = 0; i < outputs_.Size(); ++i) {
+      if (outputs_.IsSubgraphOutput(i)) {
+        const tensorflow::Tensor* tensor = nullptr;
+        TF_RETURN_IF_ERROR(handles->at(i)->Tensor(&tensor));
+        buffer_map->SetFromTensorFlow(outputs_.TfLiteIndex(i), *tensor);
+      }
+    }
+    return tensorflow::Status::OK();
+  }
+
+ private:
+  OpNode(const OpNode&) = delete;
+  OpNode& operator=(const OpNode&) = delete;
 
-// A single node within the larger 'op'. Note that this kernel executes many
-// TensorFlow ops within a single TF Lite op.
-struct OpNode {
   // The name of the TensorFlow op to execute.
-  string name;
+  string name_;
   // Index of this node into TF Lite's operator list.
-  int index;
+  int index_;
   // The corresponding NodeDef, containing the attributes for the op.
-  tensorflow::NodeDef nodedef;
+  tensorflow::NodeDef nodedef_;
   // List of inputs, as TF Lite tensor indices.
-  std::vector<int> inputs;
+  OpInputs inputs_;
   // List of outputs, as TF Lite tensor indices.
-  std::vector<int> outputs;
+  OpOutputs outputs_;
+
+  std::unique_ptr<tensorflow::EagerOperation> op_;
 };
 
-// The Larger 'op', which contains all the nodes in a supported subgraph.
+// Executes the TensorFlow op given by 'op_name', with the attributes specified
+// in 'nodedef'. Inputs and outputs are given as indices into the 'buffer_map'.
+tensorflow::Status ExecuteFlexOp(TfLiteContext* context, BufferMap* buffer_map,
+                                 OpNode* node_data) {
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(node_data->BuildEagerInputs(buffer_map),
+                                  " (while executing '", node_data->name(),
+                                  "' via Eager)");
+
+  node_data->mutable_outputs()->ResetTensorHandles();
+  int num_retvals = node_data->NumOutputs();
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(
+      EagerExecute(node_data->op(),
+                   node_data->mutable_outputs()->GetTensorHandles(),
+                   &num_retvals),
+      " (while executing '", node_data->name(), "' via Eager)");
+
+  if (num_retvals != node_data->NumOutputs()) {
+    return tensorflow::errors::Internal(
+        "Unexpected number of outputs from EagerExecute");
+  }
+
+  TF_RETURN_IF_ERROR(node_data->PersistEagerOutputs(buffer_map));
+
+  node_data->ClearEagerInputs();
+
+  return tensorflow::Status::OK();
+}
+
+// The larger 'op', which contains all the nodes in a supported subgraph.
 struct OpData {
   tensorflow::EagerContext* eager_context;
   BufferMap* buffer_map;
-  std::vector<OpNode> nodes;
+  std::vector<std::unique_ptr<OpNode>> nodes;
   std::vector<int> subgraph_inputs;
   std::vector<int> subgraph_outputs;
 };
@@ -172,8 +370,10 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
                             ->GetBufferMap(context);
 
   CHECK(params->output_tensors);
+  std::set<int> output_set;
   for (auto tensor_index : TfLiteIntArrayView(params->output_tensors)) {
     op_data->subgraph_outputs.push_back(tensor_index);
+    output_set.insert(tensor_index);
   }
 
   CHECK(params->input_tensors);
@@ -181,49 +381,55 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
     op_data->subgraph_inputs.push_back(tensor_index);
   }
 
+  op_data->nodes.reserve(params->nodes_to_replace->size);
+
   CHECK(params->nodes_to_replace);
+  tensorflow::Status status;
   for (auto node_index : TfLiteIntArrayView(params->nodes_to_replace)) {
     TfLiteNode* node;
     TfLiteRegistration* reg;
     context->GetNodeAndRegistration(context, node_index, &node, &reg);
 
-    op_data->nodes.push_back(OpNode());
-    OpNode& node_data = op_data->nodes.back();
-
-    node_data.index = node_index;
-    node_data.name = "";
-    if (node->custom_initial_data) {
-      // The flexbuffer contains a vector where the first elements is the
-      // op name and the second is a serialized NodeDef.
-      const flexbuffers::Vector& v =
-          flexbuffers::GetRoot(
-              reinterpret_cast<const uint8_t*>(node->custom_initial_data),
-              node->custom_initial_data_size)
-              .AsVector();
-
-      node_data.name = v[0].AsString().str();
-      if (!node_data.nodedef.ParseFromString(v[1].AsString().str())) {
-        // We will just leave the nodedef empty and error out in Eval().
-        node_data.nodedef.Clear();
-      }
-    }
+    op_data->nodes.emplace_back(new OpNode(node->inputs, node->outputs));
+    OpNode& node_data = *op_data->nodes.back();
 
-    // Fill NodeDef with defaults if it's a valid op.
-    const tensorflow::OpRegistrationData* op_reg_data;
-    auto tf_status = tensorflow::OpRegistry::Global()->LookUp(
-        node_data.nodedef.op(), &op_reg_data);
-    if (tf_status.ok()) {
-      AddDefaultsToNodeDef(op_reg_data->op_def, &node_data.nodedef);
-    }
+    node_data.set_index(node_index);
+    node_data.set_name("");
 
-    for (auto input_index : TfLiteIntArrayView(node->inputs)) {
-      node_data.inputs.push_back(input_index);
-    }
-    for (auto output_index : TfLiteIntArrayView(node->outputs)) {
-      node_data.outputs.push_back(output_index);
+    status = node_data.InitializeNodeDef(node->custom_initial_data,
+                                         node->custom_initial_data_size);
+    if (!status.ok()) break;
+    status = node_data.BuildEagerOp(op_data->eager_context);
+    if (!status.ok()) break;
+  }
+
+  if (ConvertStatus(context, status) != kTfLiteOk) {
+    // We can't return an error from this function but ConvertStatus will
+    // report them and we will stop processing in Prepare() if anything went
+    // wrong.
+    return op_data;
+  }
+
+  // Given a TfLite tensor index, return the OpNode that produces it,
+  // along with it index into that OpNodes list of outputs.
+  std::map<int, TensorSource> tflite_tensor_sources;
+
+  // Find out how each tensor is produced. This does not account for
+  // tensors that are not produce by eager ops.
+  for (auto& node_data : op_data->nodes) {
+    node_data->mutable_outputs()->InitializeGraphOutputs(output_set);
+    for (int i = 0; i < node_data->outputs().Size(); ++i) {
+      int output_index = node_data->outputs().TfLiteIndex(i);
+      tflite_tensor_sources[output_index] = TensorSource{node_data.get(), i};
     }
   }
 
+  // For each node, resolve the inputs, so we can keep pointers to the nodes
+  // that produces them.
+  for (auto& node_data : op_data->nodes) {
+    node_data->mutable_inputs()->InitializeTensorSources(tflite_tensor_sources);
+  }
+
   return op_data;
 }
 
@@ -252,7 +458,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         buffer_map->SetFromTfLite(tensor_index, tensor);
       }
     }
-    ++tensor_ref_count[tensor_index];
+
+    // Input tensors should never be forwarded so we increment their ref counts
+    // twice: once for this graph and another for the possibility of them being
+    // used by another subgraph, or being an output of the full graph.
+    tensor_ref_count[tensor_index] += 2;
   }
 
   // All output tensors are allocated by TensorFlow/Eager, so we
@@ -263,11 +473,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   }
 
   for (const auto& node_data : op_data->nodes) {
-    for (int tensor_index : node_data.inputs) {
-      ++tensor_ref_count[tensor_index];
+    if (node_data->nodedef().op().empty()) {
+      context->ReportError(context, "Invalid NodeDef in Flex op '%s'",
+                           node_data->name().c_str());
+      return kTfLiteError;
+    }
+    TF_LITE_ENSURE(context, node_data->op());
+
+    for (int i = 0; i < node_data->inputs().Size(); ++i) {
+      ++tensor_ref_count[node_data->inputs().TfLiteIndex(i)];
     }
   }
 
+  buffer_map->ClearForwardable();
   for (const auto& x : tensor_ref_count) {
     if (x.second == 1) {
       // This tensor is referenced once by a single op. We can allow the TF
@@ -281,9 +499,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const auto* op_data = reinterpret_cast<OpData*>(node->user_data);
+  auto* op_data = reinterpret_cast<OpData*>(node->user_data);
   BufferMap* buffer_map = op_data->buffer_map;
-  tensorflow::EagerContext* eager_context = op_data->eager_context;
 
   // Insert a tensor in the buffer map for all inputs that are not constant.
   // Constants were handled in Prepare() already.
@@ -300,18 +517,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   }
 
   // Execute the TensorFlow Ops sequentially.
-  for (const auto& node_data : op_data->nodes) {
+  for (auto& node_data : op_data->nodes) {
     SCOPED_TAGGED_OPERATOR_PROFILE(
         reinterpret_cast<profiling::Profiler*>(context->profiler),
-        node_data.name.c_str(), node_data.index);
-    if (node_data.nodedef.op().empty()) {
-      context->ReportError(context, "Invalid NodeDef in Flex op '%s'",
-                           node_data.name.c_str());
-      return kTfLiteError;
-    }
-    auto status =
-        ExecuteFlexOp(eager_context, buffer_map, node_data.name,
-                      node_data.nodedef, node_data.inputs, node_data.outputs);
+        node_data->name().c_str(), node_data->index());
+
+    auto status = ExecuteFlexOp(context, buffer_map, node_data.get());
     TF_LITE_ENSURE_OK(context, ConvertStatus(context, status));
   }
 
@@ -330,13 +541,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     tensor->data_is_stale = true;
   }
 
-  // We don't need to keep track of internal TF tensors any longer, so take
-  // them out of the buffer_map, but make sure we keep all the one we might
-  // need for other subgraphs, or as final output of inference.
-  const auto& outputs = op_data->subgraph_outputs;
-  std::set<int> keep(outputs.begin(), outputs.end());
-  buffer_map->RemoveTensorsNotInSet(keep);
-
   return kTfLiteOk;
 }
 
diff --git a/tensorflow/lite/delegates/flex/kernel_test.cc b/tensorflow/lite/delegates/flex/kernel_test.cc
index cc5c8b32a015b710ac55b1466d0ff27c128f64d5..5b3a6d164707a805f05765764b13d2d01eac967f 100644
--- a/tensorflow/lite/delegates/flex/kernel_test.cc
+++ b/tensorflow/lite/delegates/flex/kernel_test.cc
@@ -25,6 +25,7 @@ namespace {
 
 using ::testing::ContainsRegex;
 using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
 
 TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteDelegate* delegate,
                             const std::vector<int>& supported_nodes) {
@@ -36,13 +37,38 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteDelegate* delegate,
   return kTfLiteOk;
 }
 
+// There is no easy way to pass a parameter into the TfLiteDelegate's
+// 'prepare' function, so we keep a global map for testing purpused.
+// To avoid collisions use: GetPrepareFunction<__LINE__>().
+std::map<int, std::vector<int>>* GetGlobalOpLists() {
+  static auto* op_list = new std::map<int, std::vector<int>>;
+  return op_list;
+}
+
 class KernelTest : public testing::FlexModelTest {
  public:
+  static constexpr int kOnes = 1;  // This is the index of a tensor of 1's.
+  static constexpr int kTwos = 2;  // This is the index of a tensor of 2's.
+  static constexpr int kMaxTensors = 30;
+
+  static void SetUpTestSuite() { GetGlobalOpLists()->clear(); }
+
   KernelTest() {
     CHECK(delegate_data_.Prepare(tensorflow::SessionOptions{}).ok());
     interpreter_.reset(new Interpreter(&error_reporter_));
   }
 
+  typedef TfLiteStatus (*PrepareFunction)(TfLiteContext* context,
+                                          TfLiteDelegate* delegate);
+
+  template <int KEY>
+  PrepareFunction GetPrepareFunction() {
+    GetGlobalOpLists()->insert({KEY, tf_ops_});
+    return [](TfLiteContext* context, TfLiteDelegate* delegate) {
+      return GenericPrepare(context, delegate, GetGlobalOpLists()->at(KEY));
+    };
+  }
+
   template <typename T>
   void ConfigureDelegate(T prepare_function) {
     delegate_.data_ = &delegate_data_;
@@ -54,9 +80,13 @@ class KernelTest : public testing::FlexModelTest {
                                         TfLiteBufferHandle buffer_handle,
                                         TfLiteTensor* output) {
       auto* delegate_data = reinterpret_cast<DelegateData*>(delegate->data_);
-      tensorflow::StringPiece values = delegate_data->GetBufferMap(context)
-                                           ->GetTensor(buffer_handle)
-                                           .tensor_data();
+      auto* buffer_map = delegate_data->GetBufferMap(context);
+      if (!buffer_map->HasTensor(buffer_handle)) {
+        context->ReportError(context, "Tensor '%d' not found", buffer_handle);
+        return kTfLiteError;
+      }
+      tensorflow::StringPiece values =
+          buffer_map->GetTensor(buffer_handle).tensor_data();
       memcpy(output->data.raw, values.data(), values.size());
       return kTfLiteOk;
     };
@@ -114,12 +144,9 @@ TEST_F(KernelTest, BadTensorFlowOp) {
     return GenericPrepare(context, delegate, {0});
   });
 
-  SetShape(0, {2, 2, 1});
-  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
-
-  ASSERT_FALSE(Invoke());
+  ASSERT_NE(interpreter_->AllocateTensors(), kTfLiteOk);
   ASSERT_THAT(error_reporter().error_messages(),
-              ContainsRegex("while processing attributes of 'NonExistentOp'"));
+              ContainsRegex("Op type not registered 'NonExistentOp'"));
 }
 
 TEST_F(KernelTest, BadNumberOfOutputs) {
@@ -166,10 +193,7 @@ TEST_F(KernelTest, WrongSetOfNodes) {
     return GenericPrepare(context, delegate, {0, 1});
   });
 
-  SetShape(0, {2, 2, 1});
-  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
-
-  ASSERT_FALSE(Invoke());
+  ASSERT_NE(interpreter_->AllocateTensors(), kTfLiteOk);
   ASSERT_THAT(error_reporter().error_messages(),
               ContainsRegex("Invalid NodeDef in Flex op"));
 }
@@ -228,7 +252,7 @@ TEST_F(KernelTest, SplitGraph) {
   AddTfOp(testing::kAdd, {9, 16}, {17});  // => 16
 
   ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
-    // All ops by #3 are TF ops, handled by the delegate. However, because #4
+    // All ops but #3 are TF ops, handled by the delegate. However, because #4
     // depends on the non-TF op, two subgraphs are necessary:
     //    TF subgraph 1: 0, 1, 2, 6, 7, 8, 9
     //    TF Lite Op: 3
@@ -263,6 +287,132 @@ TEST_F(KernelTest, SplitGraph) {
   ASSERT_THAT(GetValues(17), ElementsAre(18.0f));
 }
 
+class MultipleSubgraphsTest : public KernelTest {
+ public:
+  static constexpr int kInput = 0;
+
+  void PrepareInterpreter(PrepareFunction prepare,
+                          const std::vector<float>& input) {
+    ConfigureDelegate(prepare);
+
+    SetShape(kOnes, {3});
+    SetValues(kOnes, {1.0f, 1.0f, 1.0f});
+    SetShape(kTwos, {3});
+    SetValues(kTwos, {2.0f, 2.0f, 2.0f});
+
+    SetValues(kInput, input);
+  }
+
+  std::vector<float> Apply(const std::vector<float>& input,
+                           std::function<float(float)> function) {
+    std::vector<float> result;
+    for (float f : input) {
+      result.push_back(function(f));
+    }
+    return result;
+  }
+};
+
+TEST_F(MultipleSubgraphsTest, ForwardabilityIsLocal) {
+  AddTensors(kMaxTensors, {kInput, kOnes, kTwos}, {12}, kTfLiteFloat32, {3});
+
+  // Only TF tensors can be forwarded, so we build a small first graph
+  // to produce tensor #10. Here #10 is forwardable, because it is only
+  // used once, as an output.
+  AddTfOp(testing::kAdd, {0, kOnes}, {3});
+  AddTfOp(testing::kAdd, {0, kOnes}, {10});
+
+  // The second TF graph, separated from the former by a TF Lite
+  // multiplication, will consume tensor #10, which is not forwardable here
+  // since it is used by more than one op. The existing code will forward the
+  // tensor anyway, because it was deemed to be forwardable by the previous
+  // subgraph.
+  AddTfLiteMulOp({3, kTwos}, {4});
+  AddTfOp(testing::kAdd, {10, 4}, {11});
+  AddTfOp(testing::kAdd, {11, 10}, {7});
+
+  // And a simple TF Lite op trying to access tensor #10, which was removed
+  // from the buffer map. It will cause Invoke() to fail.
+  AddTfLiteMulOp({10, 7}, {12});
+
+  auto input = {3.0f, 4.0f, 5.0f};
+  PrepareInterpreter(GetPrepareFunction<__LINE__>(), input);
+
+  ASSERT_TRUE(Invoke());
+  ASSERT_THAT(GetValues(12), ElementsAreArray(Apply(input, [](float in) {
+                return (4 * in + 4) * (in + 1);
+              })));
+}
+
+// Subgraphs should not remove input tensors from the buffer_map, since
+// they could be necessary for downstream graphs.
+TEST_F(MultipleSubgraphsTest, DoNotRemoveInputTensors) {
+  AddTensors(kMaxTensors, {kInput, kOnes, kTwos}, {12}, kTfLiteFloat32, {3});
+
+  // Only TF tensors can be removed, so we build a small first graph
+  // to produce tensor #10. We make sure it is used by more than one
+  // op, so it is not forwardable here.
+  AddTfOp(testing::kAdd, {0, kOnes}, {3});
+  AddTfOp(testing::kAdd, {0, kOnes}, {10});
+  AddTfOp(testing::kAdd, {10, kOnes}, {15});
+  AddTfOp(testing::kAdd, {10, kOnes}, {16});
+
+  // The second TF graph, separated from the former by a TF Lite
+  // multiplication, will consume tensor #10. The existing code will remove
+  // from the buffer_map all tensors that are not outputs, so #10 will
+  // disappear. Note that we are using #10 in two ops, so it is not forwardable
+  // either.
+  AddTfLiteMulOp({3, kTwos}, {4});
+  AddTfOp(testing::kAdd, {10, 4}, {11});
+  AddTfOp(testing::kAdd, {10, 11}, {7});
+
+  // And a simple TF Lite op trying to access tensor #10, which was removed
+  // from the buffer map. It will cause Invoke() to fail.
+  AddTfLiteMulOp({10, 7}, {12});
+
+  auto input = {3.0f, 4.0f, 5.0f};
+  PrepareInterpreter(GetPrepareFunction<__LINE__>(), input);
+
+  ASSERT_TRUE(Invoke());
+  ASSERT_THAT(GetValues(12), ElementsAreArray(Apply(input, [](float in) {
+                return (4 * in + 4) * (in + 1);
+              })));
+}
+
+// A tensor is deemed forwardable but it happens to be the input to
+// more than one subgraph. It should not be forwarded, otherwise its
+// contents will be overwritten.
+TEST_F(MultipleSubgraphsTest, DoNotForwardInputTensors) {
+  AddTensors(kMaxTensors, {kInput, kOnes, kTwos}, {12}, kTfLiteFloat32, {3});
+
+  // Only TF tensors can be forwarded, so we build a small first graph
+  // to produce tensor #10.
+  AddTfOp(testing::kAdd, {0, kOnes}, {3});
+  AddTfOp(testing::kAdd, {0, kOnes}, {10});
+
+  // The second TF graph, separated from the former by a TF Lite
+  // multiplication, will consume tensor #10 and will think it is forwardable
+  // because it is used by a single op. However, the subgraph doesn't have
+  // enough information to make that judgment, as the input tensor could be
+  // used by another graph further downstream. The existing code will forward
+  // the tensor and remove it from the buffer_map, causing a failure later.
+  AddTfLiteMulOp({3, kTwos}, {4});
+  AddTfOp(testing::kAdd, {10, 4}, {11});
+  AddTfOp(testing::kAdd, {11, 4}, {7});
+
+  // And a simple TF Lite op trying to access tensor #10, which was removed
+  // from the buffer map. It will cause Invoke() to fail.
+  AddTfLiteMulOp({10, 7}, {12});
+
+  auto input = {3.0f, 4.0f, 5.0f};
+  PrepareInterpreter(GetPrepareFunction<__LINE__>(), input);
+
+  ASSERT_TRUE(Invoke());
+  ASSERT_THAT(GetValues(12), ElementsAreArray(Apply(input, [](float in) {
+                return (5 * in + 5) * (in + 1);
+              })));
+}
+
 }  // namespace
 }  // namespace flex
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/flex/test_util.cc b/tensorflow/lite/delegates/flex/test_util.cc
index aa24675a7b1beab8632435debc8dd1fc04f347e7..a67aeef231b497de2b4749b2ce2fdd5edd5c6129 100644
--- a/tensorflow/lite/delegates/flex/test_util.cc
+++ b/tensorflow/lite/delegates/flex/test_util.cc
@@ -90,6 +90,8 @@ void FlexModelTest::AddTensors(int num_tensors, const std::vector<int>& inputs,
 
 void FlexModelTest::AddTfLiteMulOp(const std::vector<int>& inputs,
                                    const std::vector<int>& outputs) {
+  ++next_op_index_;
+
   static TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
   reg.builtin_code = BuiltinOperator_MUL;
   reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
@@ -114,6 +116,9 @@ void FlexModelTest::AddTfLiteMulOp(const std::vector<int>& inputs,
 
 void FlexModelTest::AddTfOp(TfOpType op, const std::vector<int>& inputs,
                             const std::vector<int>& outputs) {
+  tf_ops_.push_back(next_op_index_);
+  ++next_op_index_;
+
   auto attr = [](const string& key, const string& value) {
     return " attr{ key: '" + key + "' value {" + value + "}}";
   };
diff --git a/tensorflow/lite/delegates/flex/test_util.h b/tensorflow/lite/delegates/flex/test_util.h
index 2cc2dc30e92586535687187105057d41ab5c0350..1913a406e8388af30ff5ca88f18f03fb75d46c49 100644
--- a/tensorflow/lite/delegates/flex/test_util.h
+++ b/tensorflow/lite/delegates/flex/test_util.h
@@ -103,6 +103,7 @@ class FlexModelTest : public ::testing::Test {
  protected:
   std::unique_ptr<Interpreter> interpreter_;
   TestErrorReporter error_reporter_;
+  std::vector<int> tf_ops_;
 
  private:
   // Helper method to add a TensorFlow op. tflite_names needs to start with
@@ -112,6 +113,8 @@ class FlexModelTest : public ::testing::Test {
                const std::vector<int>& outputs);
 
   std::vector<std::vector<uint8_t>> flexbuffers_;
+
+  int next_op_index_ = 0;
 };
 
 }  // namespace testing
diff --git a/tensorflow/lite/delegates/nnapi/BUILD b/tensorflow/lite/delegates/nnapi/BUILD
index fd954ba222627ab0457711b87baf9c3f7573e129..dda3c025677bc0b36239b23ef3152ed30a924cac 100644
--- a/tensorflow/lite/delegates/nnapi/BUILD
+++ b/tensorflow/lite/delegates/nnapi/BUILD
@@ -3,6 +3,7 @@ package(default_visibility = [
 ])
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 
 licenses(["notice"])  # Apache 2.0
 
@@ -23,7 +24,7 @@ tf_cc_test(
     name = "nnapi_delegate_test",
     size = "small",
     srcs = ["nnapi_delegate_test.cc"],
-    tags = ["no_oss"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":nnapi_delegate",
         "//tensorflow/lite:framework",
@@ -32,3 +33,5 @@ tf_cc_test(
         "@com_google_googletest//:gtest",
     ],
 )
+
+tflite_portable_test_suite()
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
index 7908bbf1641fcf07408b9380fb1587768d9f233c..8af159e6fb7b8b4fcd3272cd61a481b85907671a 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
@@ -37,7 +37,7 @@ namespace {
 
 // TODO(b/80621585): Consider printing error string, but don't for now to
 // minimize binary size.
-#define CHECK_NN(context, code)                                               \
+#define RETURN_TFLITE_ERROR_IF_NN_ERROR(context, code)                        \
   do {                                                                        \
     const auto _code = (code);                                                \
     if (_code != ANEURALNETWORKS_NO_ERROR) {                                  \
@@ -223,8 +223,8 @@ class NNAPIOpBuilder {
         .type = ANEURALNETWORKS_TENSOR_FLOAT32,
         .dimensionCount = dimension_count,
         .dimensions = dims.data()};
-    CHECK_NN(context_,
-             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_, ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
     int ann_operand = operand_mapping_->add_new_non_tensor_operand();
     augmented_outputs_.push_back(ann_operand);
     return kTfLiteOk;
@@ -240,8 +240,8 @@ class NNAPIOpBuilder {
         static_cast<uint32_t>(tensor->dims->size),
         reinterpret_cast<uint32_t*>(tensor->dims->data), tensor->params.scale,
         tensor->params.zero_point};
-    CHECK_NN(context_,
-             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_, ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
     augmented_outputs_.push_back(ann_index);
 
     *ann_tensor_index_out = ann_index;
@@ -297,14 +297,15 @@ class NNAPIOpBuilder {
     ANeuralNetworksOperandType operand_type{
         nn_type, static_cast<uint32_t>(tensor->dims->size),
         reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint};
-    CHECK_NN(context_,
-             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_, ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
 
     if (tensor->allocation_type == kTfLiteMmapRo) {
       // TODO(b/80630405): Use NNAPIAllocation.
-      CHECK_NN(context_, ANeuralNetworksModel_setOperandValue(
-                             nn_model_, ann_tensor_index, tensor->data.raw,
-                             tensor->bytes));
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context_,
+          ANeuralNetworksModel_setOperandValue(
+              nn_model_, ann_tensor_index, tensor->data.raw, tensor->bytes));
     }
 
     *ann_tensor_index_out = ann_tensor_index;
@@ -314,12 +315,13 @@ class NNAPIOpBuilder {
   // Finish emitting the op (of type `type`) into the NN API.
   TfLiteStatus FinalizeAddOperation(ANeuralNetworksOperationType type) {
     // Actually add a NN API operation
-    CHECK_NN(context_, ANeuralNetworksModel_addOperation(
-                           nn_model_, type,
-                           static_cast<uint32_t>(augmented_inputs_.size()),
-                           augmented_inputs_.data(),
-                           static_cast<uint32_t>(augmented_outputs_.size()),
-                           augmented_outputs_.data()));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_,
+        ANeuralNetworksModel_addOperation(
+            nn_model_, type, static_cast<uint32_t>(augmented_inputs_.size()),
+            augmented_inputs_.data(),
+            static_cast<uint32_t>(augmented_outputs_.size()),
+            augmented_outputs_.data()));
     augmented_inputs_.clear();
     augmented_outputs_.clear();
     return kTfLiteOk;
@@ -329,11 +331,12 @@ class NNAPIOpBuilder {
   template <typename T>
   TfLiteStatus AddScalarOperand(T value, int32_t nn_type) {
     ANeuralNetworksOperandType operand_type{.type = nn_type};
-    CHECK_NN(context_,
-             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_, ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
     int ann_operand = operand_mapping_->add_new_non_tensor_operand();
-    CHECK_NN(context_, ANeuralNetworksModel_setOperandValue(
-                           nn_model_, ann_operand, &value, sizeof(T)));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_, ANeuralNetworksModel_setOperandValue(nn_model_, ann_operand,
+                                                       &value, sizeof(T)));
     augmented_inputs_.push_back(ann_operand);
     return kTfLiteOk;
   }
@@ -343,12 +346,12 @@ class NNAPIOpBuilder {
                                 int32_t nn_type) {
     ANeuralNetworksOperandType operand_type{
         .type = nn_type, .dimensionCount = 1, .dimensions = &num_values};
-    CHECK_NN(context_,
-             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_, ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
     int ann_operand = operand_mapping_->add_new_non_tensor_operand();
-    CHECK_NN(context_,
-             ANeuralNetworksModel_setOperandValue(
-                 nn_model_, ann_operand, values, sizeof(T) * num_values));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_, ANeuralNetworksModel_setOperandValue(
+                      nn_model_, ann_operand, values, sizeof(T) * num_values));
     augmented_inputs_.push_back(ann_operand);
     return kTfLiteOk;
   }
@@ -833,7 +836,8 @@ class NNAPIDelegateKernel {
 
     if (!nn_model_) {
       ANeuralNetworksModel* model;
-      CHECK_NN(context, ANeuralNetworksModel_create(&model));
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(context,
+                                      ANeuralNetworksModel_create(&model));
       nn_model_.reset(model);
 
       TF_LITE_ENSURE_STATUS(
@@ -842,9 +846,11 @@ class NNAPIDelegateKernel {
 
     if (!nn_compilation_) {
       ANeuralNetworksCompilation* compilation;
-      CHECK_NN(context, ANeuralNetworksCompilation_create(nn_model_.get(),
-                                                          &compilation));
-      CHECK_NN(context, ANeuralNetworksCompilation_finish(compilation));
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context,
+          ANeuralNetworksCompilation_create(nn_model_.get(), &compilation));
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context, ANeuralNetworksCompilation_finish(compilation));
       nn_compilation_.reset(compilation);
     }
     return kTfLiteOk;
@@ -852,8 +858,9 @@ class NNAPIDelegateKernel {
 
   TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node) {
     ANeuralNetworksExecution* execution = nullptr;
-    CHECK_NN(context, ANeuralNetworksExecution_create(nn_compilation_.get(),
-                                                      &execution));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context,
+        ANeuralNetworksExecution_create(nn_compilation_.get(), &execution));
 
     // Set the input tensor buffers. Note: we access tflite tensors using
     // absolute indices but NN api indices inputs by relative indices.
@@ -871,10 +878,11 @@ class NNAPIDelegateKernel {
         // copy data to pre-allocated shared memory.
         memcpy(nn_input_memory_->get_data_ptr() + input_offset,
                tensor->data.raw, tensor->bytes);
-        CHECK_NN(context, ANeuralNetworksExecution_setInputFromMemory(
-                              execution, relative_input_index, nullptr,
-                              nn_input_memory_->get_handle(), input_offset,
-                              tensor->bytes));
+        RETURN_TFLITE_ERROR_IF_NN_ERROR(
+            context,
+            ANeuralNetworksExecution_setInputFromMemory(
+                execution, relative_input_index, nullptr,
+                nn_input_memory_->get_handle(), input_offset, tensor->bytes));
         input_offset += tensor->bytes;
         relative_input_index++;
       }
@@ -885,10 +893,11 @@ class NNAPIDelegateKernel {
     size_t output_offset = 0;
     for (auto output_index : TfLiteIntArrayView(node->outputs)) {
       TfLiteTensor* tensor = &context->tensors[output_index];
-      CHECK_NN(context, ANeuralNetworksExecution_setOutputFromMemory(
-                            execution, relative_output_index, nullptr,
-                            nn_output_memory_->get_handle(), output_offset,
-                            tensor->bytes));
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context,
+          ANeuralNetworksExecution_setOutputFromMemory(
+              execution, relative_output_index, nullptr,
+              nn_output_memory_->get_handle(), output_offset, tensor->bytes));
       output_offset += tensor->bytes;
       relative_output_index++;
     }
@@ -901,15 +910,17 @@ class NNAPIDelegateKernel {
       // Here we are using a deep copy for state_in tensors so that we are not
       // reading and writing into the same buffer during a invocation.
       // TODO(110369471): using double shared buffer to minimize the copies.
-      CHECK_NN(context, ANeuralNetworksExecution_setOutput(
-                            execution, relative_output_index, nullptr,
-                            tensor->data.raw, tensor->bytes));
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context, ANeuralNetworksExecution_setOutput(
+                       execution, relative_output_index, nullptr,
+                       tensor->data.raw, tensor->bytes));
       relative_output_index++;
     }
     // Invoke ANN in blocking fashion.
     ANeuralNetworksEvent* event = nullptr;
-    CHECK_NN(context, ANeuralNetworksExecution_startCompute(execution, &event));
-    CHECK_NN(context, ANeuralNetworksEvent_wait(event));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context, ANeuralNetworksExecution_startCompute(execution, &event));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(context, ANeuralNetworksEvent_wait(event));
     ANeuralNetworksEvent_free(event);
     ANeuralNetworksExecution_free(execution);
 
@@ -1016,19 +1027,21 @@ class NNAPIDelegateKernel {
     }
 
     // Tell ANN to declare inputs/outputs
-    CHECK_NN(context, ANeuralNetworksModel_identifyInputsAndOutputs(
-                          nn_model_.get(), inputs.size(), inputs.data(),
-                          outputs.size(), outputs.data()));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context, ANeuralNetworksModel_identifyInputsAndOutputs(
+                     nn_model_.get(), inputs.size(), inputs.data(),
+                     outputs.size(), outputs.data()));
 
     // Set relaxed computation mode for fp32 if possible.
     if (kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
-      CHECK_NN(context,
-               ANeuralNetworksModel_relaxComputationFloat32toFloat16(
-                   nn_model_.get(), context->allow_fp32_relax_to_fp16));
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context, ANeuralNetworksModel_relaxComputationFloat32toFloat16(
+                       nn_model_.get(), context->allow_fp32_relax_to_fp16));
     }
 
     // Finalize the model
-    CHECK_NN(context, ANeuralNetworksModel_finish(nn_model_.get()));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context, ANeuralNetworksModel_finish(nn_model_.get()));
 
     // Create shared memory pool for inputs and outputs.
     nn_input_memory_.reset(new NNMemory("input_pool", total_input_byte_size));
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
index ceba8c735cfd434fa823f8c5d3ea635bb40257fa..5da052eb42275d684bfbf83e7b52227ccbb97a06 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
@@ -27,6 +27,16 @@ using ::testing::ElementsAreArray;
 // TODO(b/110368244): figure out how to share the existing tests in kernels/ but
 // with the delegation on. Also, add more unit tests to improve code coverage.
 
+// This matcher uses 1 as maximum tolerance.
+MATCHER(QuantizedNear, "") {
+  const int diff = abs(std::get<0>(arg) - std::get<1>(arg));
+  if (diff > 1) {
+    *result_listener << "Quantized values can be at most off by one: " << diff;
+    return false;
+  }
+  return true;
+}
+
 class SingleOpModelWithNNAPI : public SingleOpModel {
  public:
   SingleOpModelWithNNAPI() {
@@ -585,14 +595,14 @@ class ReshapeOpModel : public SingleOpModelWithNNAPI {
   ReshapeOpModel(std::initializer_list<int> input_shape,
                  std::initializer_list<int> new_shape) {
     input_ = AddInput(TensorType_FLOAT32);
-    new_shape_ = AddInput(TensorType_INT32);
+    new_shape_ = AddConstInput<int>(TensorType_INT32, new_shape,
+                                    {static_cast<int>(new_shape.size())});
     output_ = AddOutput(TensorType_FLOAT32);
     SetBuiltinOp(
         BuiltinOperator_RESHAPE, BuiltinOptions_ReshapeOptions,
         CreateReshapeOptions(builder_, builder_.CreateVector<int>(new_shape))
             .Union());
     BuildInterpreter({input_shape, {static_cast<int>(new_shape.size())}});
-    PopulateTensor<int>(new_shape_, new_shape);
   }
 
   void SetInput(std::initializer_list<float> data) {
@@ -1326,7 +1336,8 @@ TEST(NNAPIDelegate, LogisticQuantized) {
                   },
                   kQuantizedTolerance)));
   EXPECT_THAT(m.GetOutput<uint8_t>(),
-              ElementsAreArray({128, 1, 227, 251, 244, 32, 255, 188}));
+              testing::Pointwise(QuantizedNear(),
+                                 {128, 1, 227, 251, 244, 32, 255, 188}));
 }
 
 #if 0
@@ -1576,14 +1587,17 @@ class StridedSliceOpModel : public SingleOpModelWithNNAPI {
  public:
   StridedSliceOpModel(std::initializer_list<int> input_shape,
                       std::initializer_list<int> begin_shape,
+                      std::initializer_list<int> begin_data,
                       std::initializer_list<int> end_shape,
-                      std::initializer_list<int> strides_shape, int begin_mask,
+                      std::initializer_list<int> end_data,
+                      std::initializer_list<int> strides_shape,
+                      std::initializer_list<int> strides_data, int begin_mask,
                       int end_mask, int ellipsis_mask, int new_axis_mask,
                       int shrink_axis_mask) {
     input_ = AddInput(tensor_input_type);
-    begin_ = AddInput(TensorType_INT32);
-    end_ = AddInput(TensorType_INT32);
-    strides_ = AddInput(TensorType_INT32);
+    begin_ = AddConstInput(TensorType_INT32, begin_data, begin_shape);
+    end_ = AddConstInput(TensorType_INT32, end_data, end_shape);
+    strides_ = AddConstInput(TensorType_INT32, strides_data, strides_shape);
     output_ = AddOutput(tensor_input_type);
     SetBuiltinOp(
         BuiltinOperator_STRIDED_SLICE, BuiltinOptions_StridedSliceOptions,
@@ -1596,15 +1610,6 @@ class StridedSliceOpModel : public SingleOpModelWithNNAPI {
   void SetInput(std::initializer_list<input_type> data) {
     PopulateTensor<input_type>(input_, data);
   }
-  void SetBegin(std::initializer_list<int32_t> data) {
-    PopulateTensor<int32_t>(begin_, data);
-  }
-  void SetEnd(std::initializer_list<int32_t> data) {
-    PopulateTensor<int32_t>(end_, data);
-  }
-  void SetStrides(std::initializer_list<int32_t> data) {
-    PopulateTensor<int32_t>(strides_, data);
-  }
 
   std::vector<input_type> GetOutput() {
     return ExtractVector<input_type>(output_);
@@ -1619,39 +1624,47 @@ class StridedSliceOpModel : public SingleOpModelWithNNAPI {
   int output_;
 };
 
-TEST(NNAPIDelegate, StridedSliceIn2D) {
-  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
-  m.SetInput({1, 2, 3, 4, 5, 6});
-  m.SetBegin({1, 0});
-  m.SetEnd({2, 2});
-  m.SetStrides({1, 1});
+TEST(StridedSliceOpTest, In1D) {
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, {3}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
   m.Invoke();
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({4, 5}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 3}));
 }
 
-TEST(NNAPIDelegate, StridedSliceIn2D_ShrinkAxis_NegativeSlice) {
-  // This is equivalent to tf.range(4)[:, tf.newaxis][-2, -1].
-  StridedSliceOpModel<> m({4, 1}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
-  m.SetInput({0, 1, 2, 3});
-  m.SetBegin({-2, -1});
-  m.SetEnd({-1, 0});
-  m.SetStrides({1, 1});
+TEST(StridedSliceOpTest, In1D_BeginMask) {
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, {3}, {1}, {1}, 1, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3}));
+}
 
+TEST(StridedSliceOpTest, In2D_Stride2) {
+  StridedSliceOpModel<> m({2, 3}, {2}, {0, 0}, {2}, {2, 3}, {2}, {2, 2}, 0, 0,
+                          0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6});
   m.Invoke();
-  EXPECT_TRUE(m.GetOutputShape().empty());
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3}));
 }
 
-TEST(NNAPIDelegate, StridedSliceIn2D_ShrinkAxisMask) {
-  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
+TEST(StridedSliceOpTest, In2D_EndMask) {
+  StridedSliceOpModel<> m({2, 3}, {2}, {1, 0}, {2}, {2, 2}, {2}, {1, 1}, 0, 2,
+                          0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
-  m.SetBegin({0, 0});
-  m.SetEnd({1, 1});
-  m.SetStrides({1, 1});
   m.Invoke();
-  EXPECT_TRUE(m.GetOutputShape().empty());
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({4, 5, 6}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis4) {
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {0, 0, 0}, {3}, {2, 3, 1}, {3},
+                          {1, 1, 1}, 0, 0, 0, 0, 4);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 5, 7, 9, 11}));
 }
 
 static float rnn_input[] = {
@@ -1990,7 +2003,9 @@ class BaseSVDFOpModel : public SingleOpModelWithNNAPI {
     input_ = AddInput(TensorType_FLOAT32);
     weights_feature_ = AddInput(weights_feature_type);
     weights_time_ = AddInput(weights_time_type);
-    bias_ = AddNullInput();
+    // TODO(b/121383394) : figure out why optional bias causes TFLite segfault
+    // when using NNAPI delegate.
+    bias_ = AddInput(TensorType_FLOAT32);
     const int num_filters = units * rank;
     activation_state_ = AddInput(
         TensorData{TensorType_FLOAT32, {batches, memory_size * num_filters}},
@@ -2006,6 +2021,8 @@ class BaseSVDFOpModel : public SingleOpModelWithNNAPI {
         {units_},                             // bias tensor
         {batches, memory_size * num_filters}  // activation_state tensor
     });
+    // TODO(b/121383394) : remove once the optional bias bug is fixed.
+    PopulateTensor(bias_, std::vector<float>(units_));
   }
 
   // Populates the weights_feature tensor.
diff --git a/tensorflow/lite/examples/android/app/build.gradle b/tensorflow/lite/examples/android/app/build.gradle
index e5f5c7efd13b396161218294905857df479e5c3b..b372afae190ded84947b45655018a78633715c16 100644
--- a/tensorflow/lite/examples/android/app/build.gradle
+++ b/tensorflow/lite/examples/android/app/build.gradle
@@ -2,7 +2,7 @@ apply plugin: 'com.android.application'
 
 android {
     compileSdkVersion 26
-    buildToolsVersion '26.0.2'
+    buildToolsVersion '27.0.3'
     defaultConfig {
         applicationId "org.tensorflow.lite.demo"
         minSdkVersion 15
@@ -45,6 +45,6 @@ project.ext.TMP_DIR   = project.buildDir.toString() + '/downloads'
 apply from: "download-models.gradle"
 
 dependencies {
-    compile fileTree(dir: 'libs', include: ['*.jar'])
-    compile 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
+    implementation fileTree(dir: 'libs', include: ['*.jar'])
+    implementation 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
 }
diff --git a/tensorflow/lite/examples/android/build.gradle b/tensorflow/lite/examples/android/build.gradle
index 7c79358e45937e5f1cb061fc24e7de603b964885..7c038ddd46418b6498251068a284e8ffcdeda96f 100644
--- a/tensorflow/lite/examples/android/build.gradle
+++ b/tensorflow/lite/examples/android/build.gradle
@@ -6,7 +6,7 @@ buildscript {
         jcenter()
     }
     dependencies {
-        classpath 'com.android.tools.build:gradle:3.0.1'
+        classpath 'com.android.tools.build:gradle:3.1.4'
 
         // NOTE: Do not place your application dependencies here; they belong
         // in the individual module build.gradle files
diff --git a/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm b/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
index 4d5ea40cd05696f6853e7aee5f601a42a8947c90..4f6fcaa96c4b917b79dacc5180594c1458ef18ff 100644
--- a/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
+++ b/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
@@ -346,7 +346,15 @@ void ProcessInputWithQuantizedModel(
   NSLog(@"Time: %.4lf, avg: %.4lf, count: %d", end - start, total_latency / total_count,
         total_count);
 
-  const int output_size = 1000;
+  // read output size from the output sensor
+  const int output_tensor_index = interpreter->outputs()[0];
+  TfLiteTensor* output_tensor = interpreter->tensor(output_tensor_index);
+  TfLiteIntArray* output_dims = output_tensor->dims;
+  if (output_dims->size != 2 || output_dims->data[0] != 1) {
+    LOG(FATAL) << "Output of the model is in invalid format.";
+  }
+  const int output_size = output_dims->data[1];
+
   const int kNumResults = 5;
   const float kThreshold = 0.1f;
 
diff --git a/tensorflow/lite/examples/label_image/label_image.md b/tensorflow/lite/examples/label_image/label_image.md
index fd9f49918b4494eab845da7716a350ad6246f532..178f5b9d3012206571b6fcf8af1d2416df9a42e5 100644
--- a/tensorflow/lite/examples/label_image/label_image.md
+++ b/tensorflow/lite/examples/label_image/label_image.md
@@ -40,7 +40,7 @@ To run it. Prepare `./mobilenet_quant_v1_224.tflite`, `./grace_hopper.bmp`, and
 
 Run it:
 ```
-> ./label_image                                        
+> ./label_image
 Loaded model ./mobilenet_quant_v1_224.tflite
 resolved reporter
 invoked
@@ -51,9 +51,9 @@ average time: 100.986 ms
 0.0235294: 514 cornet
 0.0196078: 835 suit
 ```
-Run `interpreter->Invoker()` 100 times:
+Run `interpreter->Invoke()` 100 times:
 ```
-> ./label_image   -c 100                               
+> ./label_image   -c 100
 Loaded model ./mobilenet_quant_v1_224.tflite
 resolved reporter
 invoked
diff --git a/tensorflow/lite/experimental/micro/BUILD b/tensorflow/lite/experimental/micro/BUILD
index e11159868e11a09e1b10d59da274cd08ee472593..2d00ef76f4a3e5360e45f31ee486e0b8a7c74cc3 100644
--- a/tensorflow/lite/experimental/micro/BUILD
+++ b/tensorflow/lite/experimental/micro/BUILD
@@ -12,6 +12,8 @@ load(
 cc_library(
     name = "micro_framework",
     srcs = [
+        "debug_log.cc",
+        "debug_log_numbers.cc",
         "micro_error_reporter.cc",
         "micro_interpreter.cc",
         "micro_mutable_op_resolver.cc",
@@ -19,6 +21,8 @@ cc_library(
     ],
     hdrs = [
         "compatibility.h",
+        "debug_log.h",
+        "debug_log_numbers.h",
         "micro_error_reporter.h",
         "micro_interpreter.h",
         "micro_mutable_op_resolver.h",
diff --git a/tensorflow/lite/experimental/micro/README.md b/tensorflow/lite/experimental/micro/README.md
index 97bc093b06d1a5def76631ed55657270b8d39bf6..931c8d9d315c26f7d59821ecf74b94982d795efa 100644
--- a/tensorflow/lite/experimental/micro/README.md
+++ b/tensorflow/lite/experimental/micro/README.md
@@ -1,46 +1,142 @@
 # TensorFlow Lite for Microcontrollers
 
-This an experimental port of TensorFlow Lite aimed at micro controllers and other devices with only kilobytes of memory. It doesn't require any operating system support, any standard C or C++ libraries, or dynamic memory allocation, so it's designed to be portable even to 'bare metal' systems. The core runtime fits in 16KB on a Cortex M3, and with enough operators to run a speech keyword detection model, takes up a total of 22KB.
+This an experimental port of TensorFlow Lite aimed at micro controllers and
+other devices with only kilobytes of memory. It doesn't require any operating
+system support, any standard C or C++ libraries, or dynamic memory allocation,
+so it's designed to be portable even to 'bare metal' systems. The core runtime
+fits in 16KB on a Cortex M3, and with enough operators to run a speech keyword
+detection model, takes up a total of 22KB.
 
 The design goals are for the framework to be:
 
-- **Readable**: We want embedded software engineers to be able to understand what's required to run ML inference without having to study research papers. We've tried to keep the code base small, modular, and have reference implementations of all operations to help with this.
-
-- **Easy to modify**: We know that there are a lot of different platforms and requirements in the embedded world, and we don't expect to cover all of them in one framework. Instead, we're hoping that it can be a good starting point for developers to build on top of to meet their own needs. For example, we tried to make it easy to replace the implementations of key computational operators that are often crucial for performance, without having to touch the data flow and other runtime code. We want it to make more sense to use our workflow to handle things like model import and less-important operations, and customize the parts that matter, rather than having to reimplement everything in your own engine.
-
-- **Well-tested**: If you're modifying code, you need to know if your changes are correct. Having an easy way to test lets you develop much faster. To help there, we've written tests for all the components, and we've made sure that the tests can be run on almost any platform, with no dependencies apart from the ability to log text to a debug console somewhere. We also provide an easy way to run all the tests on-device as part of an automated test framework, and we use qemu/Renode emulation so that tests can be run even without physical devices present.
-
-- **Easy to integrate**: We want to be as open a system as possible, and use the best code available for each platform. To do that, we're going to rely on projects like [CMSIS-NN](https://www.keil.com/pack/doc/CMSIS/NN/html/index.html), [uTensor](https://github.com/uTensor/uTensor), and other vendor libraries to handle as much performance-critical code as possible. We know that there are an increasing number of options to accelerate neural networks on microcontrollers, so we're aiming to be a good host for deploying those hardware technologies too.
-
-- **Compatible**: We're using the same file schema, interpreter API, and kernel interface as regular TensorFlow Lite, so we leverage the large existing set of tools, documentation, and examples for the project. The biggest barrier to deploying ML models is getting them from a training environment into a form that's easy to run inference on, so we see reusing this rich ecosystem as being crucial to being easily usable. We also hope to integrate this experimental work back into the main codebase in the future.
+-   **Readable**: We want embedded software engineers to be able to understand
+    what's required to run ML inference without having to study research papers.
+    We've tried to keep the code base small, modular, and have reference
+    implementations of all operations to help with this.
+
+-   **Easy to modify**: We know that there are a lot of different platforms and
+    requirements in the embedded world, and we don't expect to cover all of them
+    in one framework. Instead, we're hoping that it can be a good starting point
+    for developers to build on top of to meet their own needs. For example, we
+    tried to make it easy to replace the implementations of key computational
+    operators that are often crucial for performance, without having to touch
+    the data flow and other runtime code. We want it to make more sense to use
+    our workflow to handle things like model import and less-important
+    operations, and customize the parts that matter, rather than having to
+    reimplement everything in your own engine.
+
+-   **Well-tested**: If you're modifying code, you need to know if your changes
+    are correct. Having an easy way to test lets you develop much faster. To
+    help there, we've written tests for all the components, and we've made sure
+    that the tests can be run on almost any platform, with no dependencies apart
+    from the ability to log text to a debug console somewhere. We also provide
+    an easy way to run all the tests on-device as part of an automated test
+    framework, and we use qemu/Renode emulation so that tests can be run even
+    without physical devices present.
+
+-   **Easy to integrate**: We want to be as open a system as possible, and use
+    the best code available for each platform. To do that, we're going to rely
+    on projects like
+    [CMSIS-NN](https://www.keil.com/pack/doc/CMSIS/NN/html/index.html),
+    [uTensor](https://github.com/uTensor/uTensor), and other vendor libraries to
+    handle as much performance-critical code as possible. We know that there are
+    an increasing number of options to accelerate neural networks on
+    microcontrollers, so we're aiming to be a good host for deploying those
+    hardware technologies too.
+
+-   **Compatible**: We're using the same file schema, interpreter API, and
+    kernel interface as regular TensorFlow Lite, so we leverage the large
+    existing set of tools, documentation, and examples for the project. The
+    biggest barrier to deploying ML models is getting them from a training
+    environment into a form that's easy to run inference on, so we see reusing
+    this rich ecosystem as being crucial to being easily usable. We also hope to
+    integrate this experimental work back into the main codebase in the future.
 
 To meet those goals, we've made some tradeoffs:
 
-- **Simple C++**: To help with readability, our code is written in a modern version of C++, but we generally treat it as a "better C", rather relying on more complex features such as template meta-programming. As mentioned earlier, we avoid any use of dynamic memory allocation (new/delete) or the standard C/C++ libraries, so we believe this should still be fairly portable. It does mean that some older devices with C-only toolchains won't be supported, but we're hoping that the reference operator implementations (which are simple C-like functions) can still be useful in those cases. The interfaces are also designed to be C-only, so it should be possible to integrate the resulting library with pure C projects.
-
-- **Interpreted**: Code generation is a popular pattern for embedded code, because it gives standalone code that's easy to modify and step through, but we've chosen to go with an interpreted approach. In our internal microcontroller work we've found that using an extremely stripped-down interpreter with almost no dependencies gives us a lot of the same advantages, but is easier to maintain. For example, when new updates come out for the underlying library, you can just merge your local modifications in a single step, rather than having to regenerate new code and then patch in any changes you subsequently made. The coarse granularity of the interpreted primitives means that each operation call typically takes hundreds of thousands of instruction cycles at least, so we don't see noticeable performance gains from avoiding what's essentially a single switch statement at the interpreter level to call each operation. We're still working on improving the packaging though, for example we're considering having the ability to snapshot all the source files and headers used for a particular model, being able to compile the code and data together as a library, and then access it through a minimal set of C interface calls which hide the underlying complexity.
-
-- **Flatbuffers**: We represent our models using [the standard flatbuffer schema used by the rest of TensorFlow Lite](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema.fbs), with the difference that we always keep it in read-only program memory (typically flash) rather than relying on having a file system to read it from. This is a good fit because flatbuffer's serialized format is designed to be mapped into memory without requiring any extra memory allocations or modifications to access it. All of the functions to read model values work directly on the serialized bytes, and large sections of data like weights are directly accessible as sequential C-style arrays of their data type, with no strides or unpacking needed. We do get a lot of value from using flatbuffers, but there is a cost in complexity. The flat buffer library code is all inline [inside the main headers](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema_generated.h), but it isn't straightforward to inspect their implementations, and the model data structures aren't easy to comprehend from the debugger. The header for the schema itself also has to be periodically updated when new information is added to the file format, though we try to handle that transparently for most developers by checking in a pre-generated version.
-
-- **Code Duplication**: Some of the code in this prototype largely duplicates the logic in other parts of the TensorFlow Lite code base, for example the operator wrappers. We've tried to keep share as much as we can between the two interpreters, but there are some assumptions built into the original runtime that make this difficult. We'll be working on modularizing the main interpreter so that we can move to an entirely shared system.
-
-This initial preview release is designed to get early feedback, and is not intended to be a final product. It only includes enough operations to run a simple keyword recognition model, and the implementations are not optimized. We're hoping this will be a good way to get feedback and collaborate to improve the framework.
-
-## Getting Started
+-   **Simple C++**: To help with readability, our code is written in a modern
+    version of C++, but we generally treat it as a "better C", rather relying on
+    more complex features such as template meta-programming. As mentioned
+    earlier, we avoid any use of dynamic memory allocation (new/delete) or the
+    standard C/C++ libraries, so we believe this should still be fairly
+    portable. It does mean that some older devices with C-only toolchains won't
+    be supported, but we're hoping that the reference operator implementations
+    (which are simple C-like functions) can still be useful in those cases. The
+    interfaces are also designed to be C-only, so it should be possible to
+    integrate the resulting library with pure C projects.
+
+-   **Interpreted**: Code generation is a popular pattern for embedded code,
+    because it gives standalone code that's easy to modify and step through, but
+    we've chosen to go with an interpreted approach. In our internal
+    microcontroller work we've found that using an extremely stripped-down
+    interpreter with almost no dependencies gives us a lot of the same
+    advantages, but is easier to maintain. For example, when new updates come
+    out for the underlying library, you can just merge your local modifications
+    in a single step, rather than having to regenerate new code and then patch
+    in any changes you subsequently made. The coarse granularity of the
+    interpreted primitives means that each operation call typically takes
+    hundreds of thousands of instruction cycles at least, so we don't see
+    noticeable performance gains from avoiding what's essentially a single
+    switch statement at the interpreter level to call each operation. We're
+    still working on improving the packaging though, for example we're
+    considering having the ability to snapshot all the source files and headers
+    used for a particular model, being able to compile the code and data
+    together as a library, and then access it through a minimal set of C
+    interface calls which hide the underlying complexity.
+
+-   **Flatbuffers**: We represent our models using
+    [the standard flatbuffer schema used by the rest of TensorFlow Lite](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema.fbs),
+    with the difference that we always keep it in read-only program memory
+    (typically flash) rather than relying on having a file system to read it
+    from. This is a good fit because flatbuffer's serialized format is designed
+    to be mapped into memory without requiring any extra memory allocations or
+    modifications to access it. All of the functions to read model values work
+    directly on the serialized bytes, and large sections of data like weights
+    are directly accessible as sequential C-style arrays of their data type,
+    with no strides or unpacking needed. We do get a lot of value from using
+    flatbuffers, but there is a cost in complexity. The flat buffer library code
+    is all inline
+    [inside the main headers](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema_generated.h),
+    but it isn't straightforward to inspect their implementations, and the model
+    data structures aren't easy to comprehend from the debugger. The header for
+    the schema itself also has to be periodically updated when new information
+    is added to the file format, though we try to handle that transparently for
+    most developers by checking in a pre-generated version.
+
+-   **Code Duplication**: Some of the code in this prototype largely duplicates
+    the logic in other parts of the TensorFlow Lite code base, for example the
+    operator wrappers. We've tried to keep share as much as we can between the
+    two interpreters, but there are some assumptions built into the original
+    runtime that make this difficult. We'll be working on modularizing the main
+    interpreter so that we can move to an entirely shared system.
+
+This initial preview release is designed to get early feedback, and is not
+intended to be a final product. It only includes enough operations to run a
+simple keyword recognition model, and the implementations are not optimized.
+We're hoping this will be a good way to get feedback and collaborate to improve
+the framework.
+
+## Getting Started with Make
 
 Building requires a Linux or OS X machine.
 
- - Open a terminal
- - Download the TensorFlow source with `git clone https://github.com/tensorflow/tensorflow.git`
- - Enter the source root directory by running `cd tensorflow`
- - Download the dependencies by running `tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh`. This may take a few minutes
- - Build and test the library with `make -f tensorflow/lite/experimental/micro/tools/make/Makefile test`
+-   Open a terminal
+-   Download the TensorFlow source with `git clone
+    https://github.com/tensorflow/tensorflow.git`
+-   Enter the source root directory by running `cd tensorflow`
+-   Download the dependencies by running
+    `tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh`.
+    This may take a few minutes
+-   Build and test the library with `make -f
+    tensorflow/lite/experimental/micro/tools/make/Makefile test`
 
 You should see a series of compilation steps, followed by `~~~ALL TESTS
 PASSED~~~` for the various tests of the code that it will run. If there's an
 error, you should get an informative message from make about what went wrong.
 
-These tests are all built as simple binaries with few dependencies, so you can run them manually. For example, here's how to run the depthwise convolution test, and its output:
+These tests are all built as simple binaries with few dependencies, so you can
+run them manually. For example, here's how to run the depthwise convolution
+test, and its output:
 
 ```
 tensorflow/lite/experimental/micro/tools/make/gen/linux_x86_64/bin/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test
@@ -53,7 +149,9 @@ Testing SimpleTestReluQuantized
 ~ALL TESTS PASSED~~~
 ```
 
-Looking at the [depthwise_conv_test.cc](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc) code, you'll see a sequence that looks like this:
+Looking at the
+[depthwise_conv_test.cc](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc)
+code, you'll see a sequence that looks like this:
 
 ```
 ...
@@ -74,19 +172,41 @@ output, and the test harness that runs the binary during the make process knows
 that everything ran correctly. If there's an error, the lack of the expected
 string lets the harness know that the test failed.
 
-So, why are we running tests in this complicated way? So far, we've been building binaries that run locally on the Mac OS or Linux machine you're building on, but this approach becomes important when we're targeting simple micro controller devices.
+So, why are we running tests in this complicated way? So far, we've been
+building binaries that run locally on the Mac OS or Linux machine you're
+building on, but this approach becomes important when we're targeting simple
+micro controller devices.
 
 ## Building for the "Blue Pill" STM32F103
 
-The goal of this library is to enable machine learning on resource-constrained micro controllers and DSPs, and as part of that we've targeted the ["Blue Pill" STM32F103-compatible development board](https://github.com/google/stm32_bare_lib) as a cheap and popular platform. It only has 20KB of RAM and 64KB of flash, so it's a good device to ensure we can run efficiently on small chips.
-
-It's fairly easy to [buy and wire up a physical board](https://github.com/google/stm32_bare_lib#wiring-up-your-blue-pill), but even if you don't have an actual device, the [Renode project](https://renode.io/) makes it easy to run a faithful emulation on your desktop machine. You'll need [Docker](https://www.docker.com/) installed, but once you have that set up, try running the following command:
-
-`make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=bluepill test`
-
-You should see a similar set of outputs as you did in the previous section, with the addition of some extra Docker logging messages. These are because we're using Docker to run the Renode micro controller emulation tool, and the tests themselves are being run on a simulated STM32F103 device. The communication channels between an embedded device and the host are quite limited, so the test harness looks at the output of the debug log to see if tests have passed, just as it did in the previous section. This makes it a very flexible way to run cross-platform tests, even when a platform has no operating system facilities, as long as it can output debugging text logs.
-
-To understand what's happening here, try running the same depthwise convolution test, but through the emulated device test harness, with the following command:
+The goal of this library is to enable machine learning on resource-constrained
+micro controllers and DSPs, and as part of that we've targeted the
+["Blue Pill" STM32F103-compatible development board](https://github.com/google/stm32_bare_lib)
+as a cheap and popular platform. It only has 20KB of RAM and 64KB of flash, so
+it's a good device to ensure we can run efficiently on small chips.
+
+It's fairly easy to
+[buy and wire up a physical board](https://github.com/google/stm32_bare_lib#wiring-up-your-blue-pill),
+but even if you don't have an actual device, the
+[Renode project](https://renode.io/) makes it easy to run a faithful emulation
+on your desktop machine. You'll need [Docker](https://www.docker.com/)
+installed, but once you have that set up, try running the following command:
+
+`make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=bluepill
+test`
+
+You should see a similar set of outputs as you did in the previous section, with
+the addition of some extra Docker logging messages. These are because we're
+using Docker to run the Renode micro controller emulation tool, and the tests
+themselves are being run on a simulated STM32F103 device. The communication
+channels between an embedded device and the host are quite limited, so the test
+harness looks at the output of the debug log to see if tests have passed, just
+as it did in the previous section. This makes it a very flexible way to run
+cross-platform tests, even when a platform has no operating system facilities,
+as long as it can output debugging text logs.
+
+To understand what's happening here, try running the same depthwise convolution
+test, but through the emulated device test harness, with the following command:
 
 ```
 tensorflow/lite/experimental/micro/testing/test_bluepill_binary.sh \
@@ -115,7 +235,7 @@ LOGS:
 03:27:32.4834 [DEBUG] cpu.uartSemihosting: [+0.18ms host +0s virt 0s virt from start]   Testing SimpleTestReluQuantized
 03:27:32.4838 [DEBUG] cpu.uartSemihosting: [+0.4ms host +0s virt 0s virt from start]   4/4 tests passed
 03:27:32.4839 [DEBUG] cpu.uartSemihosting: [+41µs host +0s virt 0s virt from start]   ~~~ALL TESTS PASSED~~~
-03:27:32.4839 [DEBUG] cpu.uartSemihosting: [+5µs host +0s virt 0s virt from start]   
+03:27:32.4839 [DEBUG] cpu.uartSemihosting: [+5µs host +0s virt 0s virt from start]
 ...
 tensorflow/lite/experimental/micro/tools/make/gen/bluepill_cortex-m3/bin/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test: PASS
 ```
@@ -128,16 +248,19 @@ than your desktop. We hope that the simplicity of this testing approach will
 help make adding support for new platforms as easy as possible.
 
 ## Building for "Hifive1" SiFive FE310 development board
-We've targeted the ["HiFive1" Arduino-compatible development board](https://www.sifive.com/boards/hifive1) as a test platform for RISC-V MCU.
 
-Similar to Blue Pill setup, you will need Docker installed. The binary can be executed on either HiFive1 board or emulated using [Renode project](https://renode.io/) on your desktop machine.
+We've targeted the
+["HiFive1" Arduino-compatible development board](https://www.sifive.com/boards/hifive1)
+as a test platform for RISC-V MCU.
+
+Similar to Blue Pill setup, you will need Docker installed. The binary can be
+executed on either HiFive1 board or emulated using
+[Renode project](https://renode.io/) on your desktop machine.
 
 The following instructions builds and transfers the source files to the Docker
-```
-docker build -t riscv_build \
--f {PATH_TO_TENSORFLOW_ROOT_DIR}/tensorflow/lite/experimental/micro/testing/Dockerfile.riscv \
-{PATH_TO_TENSORFLOW_ROOT_DIR}/tensorflow/lite/experimental/micro/testing/
-```
+`docker build -t riscv_build \ -f
+{PATH_TO_TENSORFLOW_ROOT_DIR}/tensorflow/lite/experimental/micro/testing/Dockerfile.riscv
+\ {PATH_TO_TENSORFLOW_ROOT_DIR}/tensorflow/lite/experimental/micro/testing/`
 
 You should see output that looks something like this:
 
@@ -160,18 +283,25 @@ Successfully tagged riscv_build:latest
 
 Building micro_speech_test binary
 
- - Lauch the Docker that we just created using: `docker run -it-v /tmp/copybara_out:/workspace riscv_build:latest bash`
- - Enter the source root directory by running `cd /workspace`
- - Download the dependencies by running `./tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh`. This may take a few minutes.
- - Set the path to RISC-V tools: `export PATH=${PATH}:/workspace/tensorflow/lite/experimental/micro/tools/make/downloads/riscv_toolchain/bin/`
- - Build the binary: `make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=riscv32_mcu`
+-   Lauch the Docker that we just created using: `docker run -it-v
+    /tmp/copybara_out:/workspace riscv_build:latest bash`
+-   Enter the source root directory by running `cd /workspace`
+-   Download the dependencies by running
+    `./tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh`.
+    This may take a few minutes.
+-   Set the path to RISC-V tools: `export
+    PATH=${PATH}:/workspace/tensorflow/lite/experimental/micro/tools/make/downloads/riscv_toolchain/bin/`
+-   Build the binary: `make -f
+    tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=riscv32_mcu`
 
 Lauching Renode to test the binary, currently this set up is not automated.
 
- - Until https://github.com/renode/renode/pull/30 is in the Docker image, patch the change manully: `sed -E -i 's/"rv32g"/"rv32imac"/g' /opt/renode/platforms/cpus/sifive-fe310.repl`
-
+-   Until https://github.com/renode/renode/pull/30 is in the Docker image, patch
+    the change manully: `sed -E -i 's/"rv32g"/"rv32imac"/g'
+    /opt/renode/platforms/cpus/sifive-fe310.repl`
 
- - Execute the binary on Renode: `renode -P 5000 --disable-xwt -e 's @/workspace/tensorflow/lite/experimental/micro/testing/sifive_fe310.resc'`
+-   Execute the binary on Renode: `renode -P 5000 --disable-xwt -e 's
+    @/workspace/tensorflow/lite/experimental/micro/testing/sifive_fe310.resc'`
 
 You should see the following log with the magic string `~~~ALL TEST PASSED~~~`:
 
@@ -187,5 +317,75 @@ You should see the following log with the magic string `~~~ALL TEST PASSED~~~`:
 02:25:22.4253 [DEBUG] uart0: [+0.16ms host +0s virt 0.28s virt from start]   Progam has exited with code:0x00000000
 ```
 
+## Building for Ambiq Micro Apollo3Blue EVB
+
+Follow these steps to get the pushbutton yes/no example working on Apollo 3:
+
+1.  Make sure to run the "Getting Started" section before performing the
+    following steps
+2.  Download Apollo3-SDK-2018.08.13 and place in
+    `tensorflow/lite/experimental/micro/tools/make/downloads`. This is not yet
+    publicly released, but you can contact ashah@ambiqmicro.com to request a
+    copy.
+3.  Compile the project with the following command: make -f
+    tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=apollo3evb
+    pushbutton_cmsis_speech_test_bin
+4.  Install [Segger JLink tools](https://www.segger.com/downloads/jlink/)
+5.  Connect the Apollo3 EVB (with mic shield in slot 3 of Microbus Shield board)
+    to the computer and power it on.
+6.  Start the GDB server in a new terminal with the following command:
+    JLinkGDBServer -select USB -device AMA3B1KK-KBR -endian little -if SWD
+    -speed 1000 -noir -noLocalhostOnly
+    1.  The command has run successfully if you see the message "Waiting for GDB
+        connection"
+7.  Back in the original terminal, run the program via the debugger
+    1.  Navigate to
+        tensorflow/lite/experimental/micro/examples/micro_speech/apollo3
+    2.  Start gdb by entering the following command: arm-none-eabi-gdb
+    3.  Run the command script by entering the following command: source
+        pushbutton_cmsis_scores.cmd. This script does the following:
+        1.  Load the binary created in step 6
+        2.  Set a breakpoint after inference scores have been computed
+        3.  Tell the debugger what variables should be printed out at this
+            breakpoint
+        4.  Begin program execution
+        5.  Press Ctrl+c to exit
+    4.  Press BTN2. An LED will flash for 1 second. Speak your utterance during
+        this one second
+    5.  The debugger will print out four numbers. They are the probabilites for
+        1) no speech, 2) unknown speech, 3) yes, 4) no
+    6.  The EVB LEDs will indicate detection.
+        1.  LED0 (rightmost LED) - ON when capturing 1sec of audio
+        2.  LED1 - ON when detecting silence
+        3.  LED2 - ON when detecting UNKNOWN utterance
+        4.  LED3 - ON when detecting YES utterance
+        5.  LED4 (leftmost LED) - ON when detecting NO utterance
+
+### Additional Apollo3 Instructions
+
+To flash a part with JFlash Lite, do the following: 1. At the command line:
+JFlashLiteExe 2. Device = AMA3B1KK-KBR 3. Interface = SWD at 1000 kHz 4. Data
+file =
+tensorflow/lite/experimental/micro/tools/make/gen/apollo3evb_cortex-m4/bin/pushbutton_cmsis_speech_test.bin
+5. Prog Addr = 0x0000C000
+
+## Generating Project Files
+
+It's not always easy or convenient to use a makefile-based build process,
+especially if you're working on a product that uses a different IDE for the rest
+of its code. To address that, it's possible to generate standalone project
+folders for various popular build systems. These projects are self-contained,
+with only the headers and source files needed by a particular binary, and
+include project files to make loading them into an IDE easy. These can be
+auto-generated for any target you can compile using the main Make system, using
+a command like this (making sure you've run `download_dependencies.sh` first):
 
+```
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=mbed TAGS="CMSIS disco_f746ng" generate_micro_speech_mbed_project
+```
 
+This will create a folder in
+`tensorflow/lite/experimental/micro/tools/make/gen/mbed_cortex-m4/prj/micro_speech_main_test/mbed`
+that contains the source and header files, some Mbed configuration files, and a
+README. You should then be able to copy this directory to another machine, and
+use it just like any other Mbed project.
diff --git a/tensorflow/lite/experimental/micro/bluepill/debug_log.cc b/tensorflow/lite/experimental/micro/bluepill/debug_log.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4812a918498ee2ab52e114bce9ca0cf3919b2254
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/bluepill/debug_log.cc
@@ -0,0 +1,27 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/debug_log.h"
+
+// For Arm Cortex-M devices, calling SYS_WRITE0 will output the zero-terminated
+// string pointed to by R1 to any debug console that's attached to the system.
+extern "C" void DebugLog(const char* s) {
+  asm("mov r0, #0x04\n"  // SYS_WRITE0
+      "mov r1, %[str]\n"
+      "bkpt #0xAB\n"
+      :
+      : [ str ] "r"(s)
+      : "r0", "r1");
+}
diff --git a/tensorflow/lite/experimental/micro/debug_log.cc b/tensorflow/lite/experimental/micro/debug_log.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3d4ca44d76b73020848e9757c230d7bf69ff5aaa
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/debug_log.cc
@@ -0,0 +1,41 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Reference implementation of the DebugLog() function that's required for a
+// platform to support the TensorFlow Lite for Microcontrollers library. This is
+// the only function that's absolutely required to be available on a target
+// device, since it's used for communicating test results back to the host so
+// that we can verify the implementation is working correctly.
+// It's designed to be as easy as possible to supply an implementation though.
+// On platforms that have a POSIX stack or C library, it can be written as a
+// single call to `fprintf(stderr, "%s", s)` to output a string to the error
+// stream of the console, but if there's no OS or C library available, there's
+// almost always an equivalent way to write out a string to some serial
+// interface that can be used instead. For example on Arm M-series MCUs, calling
+// the `bkpt #0xAB` assembler instruction will output the string in r1 to
+// whatever debug serial connection is available. If you're running mbed, you
+// can do the same by creating `Serial pc(USBTX, USBRX)` and then calling
+// `pc.printf("%s", s)`.
+// To add an equivalent function for your own platform, create your own
+// implementation file, and place it in a subfolder with named after the OS
+// you're targeting. For example, see the Cortex M bare metal version in
+// tensorflow/lite/experimental/micro/bluepill/debug_log.cc or the mbed one on
+// tensorflow/lite/experimental/micro/mbed/debug_log.cc.
+
+#include "tensorflow/lite/experimental/micro/debug_log.h"
+
+#include <cstdio>
+
+extern "C" void DebugLog(const char* s) { fprintf(stderr, "%s", s); }
diff --git a/tensorflow/lite/experimental/micro/debug_log.h b/tensorflow/lite/experimental/micro/debug_log.h
new file mode 100644
index 0000000000000000000000000000000000000000..c0e395c3760e2e0c57b50c38c05737dfecb7e680
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/debug_log.h
@@ -0,0 +1,23 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_DEBUG_LOG_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_DEBUG_LOG_H_
+
+// This function should be implemented by each target platform, and provide a
+// way for strings to be output to some text stream. For more information, see
+// tensorflow/lite/experimental/micro/debug_log.cc.
+extern "C" void DebugLog(const char* s);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_DEBUG_LOG_H_
diff --git a/tensorflow/lite/experimental/micro/debug_log_numbers.cc b/tensorflow/lite/experimental/micro/debug_log_numbers.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8e86730674859d5560e5ec6b243e40c95f88bf4f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/debug_log_numbers.cc
@@ -0,0 +1,185 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Implements debug logging for numbers by converting them into strings and then
+// calling the main DebugLog(char*) function. These are separated into a
+// different file so that platforms can just implement the string output version
+// of DebugLog() and then get the numerical variations without requiring any
+// more code.
+
+#include "tensorflow/lite/experimental/micro/debug_log_numbers.h"
+
+#include "tensorflow/lite/experimental/micro/debug_log.h"
+
+namespace {
+
+// All input buffers to the number conversion functions must be this long.
+static const int kFastToBufferSize = 48;
+
+// Reverses a zero-terminated string in-place.
+char* ReverseStringInPlace(char* start, char* end) {
+  char* p1 = start;
+  char* p2 = end - 1;
+  while (p1 < p2) {
+    char tmp = *p1;
+    *p1++ = *p2;
+    *p2-- = tmp;
+  }
+  return start;
+}
+
+// Appends a string to a string, in-place. You need to pass in the maximum
+// string length as the second argument.
+char* StrCatStr(char* main, int main_max_length, const char* to_append) {
+  char* current = main;
+  while (*current != 0) {
+    ++current;
+  }
+  char* current_end = main + (main_max_length - 1);
+  while ((*to_append != 0) && (current < current_end)) {
+    *current = *to_append;
+    ++current;
+    ++to_append;
+  }
+  *current = 0;
+  return current;
+}
+
+// Populates the provided buffer with an ASCII representation of the number.
+char* FastUInt32ToBufferLeft(uint32_t i, char* buffer, int base) {
+  char* start = buffer;
+  do {
+    int32_t digit = i % base;
+    char character;
+    if (digit < 10) {
+      character = '0' + digit;
+    } else {
+      character = 'a' + (digit - 10);
+    }
+    *buffer++ = character;
+    i /= base;
+  } while (i > 0);
+  *buffer = 0;
+  ReverseStringInPlace(start, buffer);
+  return buffer;
+}
+
+// Populates the provided buffer with an ASCII representation of the number.
+char* FastInt32ToBufferLeft(int32_t i, char* buffer) {
+  uint32_t u = i;
+  if (i < 0) {
+    *buffer++ = '-';
+    u = -u;
+  }
+  return FastUInt32ToBufferLeft(u, buffer, 10);
+}
+
+// Converts a number to a string and appends it to another.
+char* StrCatInt32(char* main, int main_max_length, int32_t number) {
+  char number_string[kFastToBufferSize];
+  FastInt32ToBufferLeft(number, number_string);
+  return StrCatStr(main, main_max_length, number_string);
+}
+
+// Converts a number to a string and appends it to another.
+char* StrCatUInt32(char* main, int main_max_length, uint32_t number, int base) {
+  char number_string[kFastToBufferSize];
+  FastUInt32ToBufferLeft(number, number_string, base);
+  return StrCatStr(main, main_max_length, number_string);
+}
+
+// Populates the provided buffer with ASCII representation of the float number.
+// Avoids the use of any floating point instructions (since these aren't
+// supported on many microcontrollers) and as a consequence prints values with
+// power-of-two exponents.
+char* FastFloatToBufferLeft(float f, char* buffer) {
+  char* current = buffer;
+  char* current_end = buffer + (kFastToBufferSize - 1);
+  // Access the bit fields of the floating point value to avoid requiring any
+  // float instructions. These constants are derived from IEEE 754.
+  const uint32_t sign_mask = 0x80000000;
+  const uint32_t exponent_mask = 0x7f800000;
+  const int32_t exponent_shift = 23;
+  const int32_t exponent_bias = 127;
+  const uint32_t fraction_mask = 0x007fffff;
+  const uint32_t u = *reinterpret_cast<uint32_t*>(&f);
+  const int32_t exponent =
+      ((u & exponent_mask) >> exponent_shift) - exponent_bias;
+  const uint32_t fraction = (u & fraction_mask);
+  // Expect ~0x2B1B9D3 for fraction.
+  if (u & sign_mask) {
+    *current = '-';
+    current += 1;
+  }
+  *current = 0;
+  // These are special cases for infinities and not-a-numbers.
+  if (exponent == 128) {
+    if (fraction == 0) {
+      current = StrCatStr(current, (current_end - current), "Inf");
+      return current;
+    } else {
+      current = StrCatStr(current, (current_end - current), "NaN");
+      return current;
+    }
+  }
+  // 0x007fffff (8388607) represents 0.99... for the fraction, so to print the
+  // correct decimal digits we need to scale our value before passing it to the
+  // conversion function. This scale should be 10000000/8388608 = 1.1920928955.
+  // We can approximate this using multiply-adds and right-shifts using the
+  // values in this array. The 1. portion of the number string is printed out
+  // in a fixed way before the fraction, below.
+  const int32_t scale_shifts_size = 13;
+  const int8_t scale_shifts[13] = {3,  4,  8,  11, 13, 14, 17,
+                                   18, 19, 20, 21, 22, 23};
+  uint32_t scaled_fraction = fraction;
+  for (int i = 0; i < scale_shifts_size; ++i) {
+    scaled_fraction += (fraction >> scale_shifts[i]);
+  }
+  *current = '1';
+  current += 1;
+  *current = '.';
+  current += 1;
+  *current = 0;
+  current = StrCatUInt32(current, (current_end - current), scaled_fraction, 10);
+  current = StrCatStr(current, (current_end - current), "*2^");
+  current = StrCatInt32(current, (current_end - current), exponent);
+  return current;
+}
+
+}  // namespace
+
+extern "C" void DebugLogInt32(int32_t i) {
+  char number_string[kFastToBufferSize];
+  FastInt32ToBufferLeft(i, number_string);
+  DebugLog(number_string);
+}
+
+extern "C" void DebugLogUInt32(uint32_t i) {
+  char number_string[kFastToBufferSize];
+  FastUInt32ToBufferLeft(i, number_string, 10);
+  DebugLog(number_string);
+}
+
+extern "C" void DebugLogHex(uint32_t i) {
+  char number_string[kFastToBufferSize];
+  FastUInt32ToBufferLeft(i, number_string, 16);
+  DebugLog(number_string);
+}
+
+extern "C" void DebugLogFloat(float i) {
+  char number_string[kFastToBufferSize];
+  FastFloatToBufferLeft(i, number_string);
+  DebugLog(number_string);
+}
diff --git a/tensorflow/lite/experimental/micro/debug_log_numbers.h b/tensorflow/lite/experimental/micro/debug_log_numbers.h
new file mode 100644
index 0000000000000000000000000000000000000000..d889e751730495e2d1bf6232e7b9c2cbb76c9667
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/debug_log_numbers.h
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_DEBUG_LOG_NUMBERS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_DEBUG_LOG_NUMBERS_H_
+
+#include <cstdint>
+
+// Output numbers to the debug logging stream.
+extern "C" {
+void DebugLogInt32(int32_t i);
+void DebugLogUInt32(uint32_t i);
+void DebugLogHex(uint32_t i);
+void DebugLogFloat(float i);
+}
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_DEBUG_LOG_NUMBERS_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/.gitignore b/tensorflow/lite/experimental/micro/examples/micro_speech/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..d8dd7532abcc65af52e9db03c516274e3d674dc1
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/.gitignore
@@ -0,0 +1 @@
+*.wav
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD b/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
index 799b2e5a5dd097c6e017f574449d339992f7c41b..702b893e6f433526e77ca20b4d9468ddb7da59bc 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
@@ -176,7 +176,6 @@ cc_library(
         ":audio_provider",
         ":model_settings",
         ":preprocessor_reference",
-        ":timer",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
     ],
@@ -191,7 +190,6 @@ tflite_micro_cc_test(
         ":audio_provider",
         ":feature_provider",
         ":model_settings",
-        ":timer",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
@@ -199,22 +197,27 @@ tflite_micro_cc_test(
 )
 
 cc_library(
-    name = "timer",
+    name = "recognize_commands",
     srcs = [
-        "timer.cc",
+        "recognize_commands.cc",
     ],
     hdrs = [
-        "timer.h",
+        "recognize_commands.h",
+    ],
+    deps = [
+        ":model_settings",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
     ],
 )
 
 tflite_micro_cc_test(
-    name = "timer_test",
+    name = "recognize_commands_test",
     srcs = [
-        "timer_test.cc",
+        "recognize_commands_test.cc",
     ],
     deps = [
-        ":timer",
+        ":recognize_commands",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
@@ -232,7 +235,7 @@ cc_binary(
         ":features_test_data",
         ":model_settings",
         ":preprocessor_reference",
-        ":timer",
+        ":recognize_commands",
         ":tiny_conv_model_data",
         "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite/experimental/micro:micro_framework",
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/Makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..3d560510ad140ff0bba84ebcf790a0fda90e72fa
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/Makefile.inc
@@ -0,0 +1,43 @@
+# Settings for targets that use the CMSIS library.
+ifneq ($(filter CMSIS,$(ALL_TAGS)),)
+  INCLUDES += \
+    -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \
+    -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Include/ \
+    -I$(MAKEFILE_DIR)/downloads/CMSIS_ext/
+
+  CMSIS_PREPROCESSOR_SRCS := \
+    tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.cc \
+    tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.cc \
+
+  CMSIS_PREPROCESSOR_HDRS := \
+    tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h \
+    tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.h \
+    third_party/CMSIS_ext/arm_cmplx_mag_squared_q10p6.h
+
+  PREPROCESSOR_TEST_SRCS += $(CMSIS_PREPROCESSOR_SRCS)
+  PREPROCESSOR_TEST_HDRS += $(CMSIS_PREPROCESSOR_HDRS)
+
+  FEATURE_PROVIDER_TEST_SRCS += $(CMSIS_PREPROCESSOR_SRCS)
+  FEATURE_PROVIDER_TEST_HDRS += $(CMSIS_PREPROCESSOR_HDRS)
+
+  MICRO_SPEECH_SRCS += $(CMSIS_PREPROCESSOR_SRCS)
+  MICRO_SPEECH_HDRS += $(CMSIS_PREPROCESSOR_HDRS)
+
+  THIRD_PARTY_CC_SRCS += \
+    third_party/CMSIS_ext/arm_cmplx_mag_squared_q10p6.c \
+    third_party/cmsis/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_q15.c \
+    third_party/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c \
+    third_party/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c \
+    third_party/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c \
+    third_party/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c \
+    third_party/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.S \
+    third_party/cmsis/CMSIS/DSP/Source/CommonTables/arm_const_structs.c \
+    third_party/cmsis/CMSIS/DSP/Source/CommonTables/arm_common_tables.c \
+    third_party/cmsis/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q15.c \
+    third_party/cmsis/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q7.c
+
+  THIRD_PARTY_CC_HDRS += \
+    third_party/cmsis/CMSIS/DSP/Include/arm_common_tables.h \
+    third_party/cmsis/CMSIS/DSP/Include/arm_const_structs.h
+
+endif
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/README.md b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..65aec34a1f7991fad33a61a12eddd414577c666d
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/README.md
@@ -0,0 +1,23 @@
+# Description of files
+
+*   **create_constants.py**: Python file used to create hanning.cc, hanning.h,
+    sin_1k.cc, and sin_1k.h
+*   **hanning.cc**: Precomputed
+    [Hann window](https://en.wikipedia.org/wiki/Hann_function) for use in the
+    preprocessor. This file is created in ../create_constants.py
+*   **hanning.h**: Header file fro hanning.cc
+*   **preprocessor.cc**: CMSIS version of the preprocessor
+*   **sin_1k.cc**: A 1 kHZ sinusoid used for comparing the CMSIS preprocessor
+    with the Micro-Lite fixed_point preprocessor
+*   **sin_1k.h**: Header file for sin_1k.cc
+
+# Description of externally downloaded files in ../CMSIS_ext
+
+*   **arm_cmplx_mag_squared_q10p6.c**: Modified version of the ARM CMSIS
+    function
+    [arm_cmplx_mag_squared.c](http://arm-software.github.io/CMSIS_5/DSP/html/group__cmplx__mag__squared.html#ga45537f576102d960d467eb722b8431f2).
+    The modification is that we have changed the amount of right-shift to make
+    sure our data is in the correct range. We redistribute because the original
+    content was created with the Apache 2.0 license.
+*   **arm_cmplx_mag_squared_q10p6.h**: Header file for
+    arm_cmplx_mag_squared_q10p6.c
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/create_constants.py b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/create_constants.py
new file mode 100755
index 0000000000000000000000000000000000000000..daf7e3cde89a0380cbbcae6ddc88859c8e87ffb9
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/create_constants.py
@@ -0,0 +1,75 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Outputs tables used for fast calculations at runtime."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# import soundfile as sf
+import numpy as np
+
+
+def to_cc(x, varname, directory='', scale_factor=1):
+  """Writes table values to a C++ source file."""
+  x = (x / np.max(np.abs(x))) * 32768 * scale_factor
+  x[x > 32767] = 32767
+  x[x < -32768] = -32768
+  x = x.astype(int)
+  x = [str(v) if i % 10 != 0 else '\n    ' + str(v) for i, v in enumerate(x)]
+
+  cmsis_path = 'tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS'
+  xstr = '#include "{}/{}.h"\n\n'.format(cmsis_path, varname)
+  xstr += 'const int g_{}_size = {};\n'.format(varname, len(x))
+  xstr += 'const int16_t g_{}[{}] = {{{}}};\n'.format(varname, len(x),
+                                                      ', '.join(x))
+
+  with open(directory + varname + '.cc', 'w') as f:
+    f.write(xstr)
+
+
+def to_h(_, varname, directory=''):
+  """Writes a header file for the table values."""
+  tf_prepend = 'TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_'
+  xstr = '#ifndef {}{}_H_\n'.format(tf_prepend, varname.upper())
+  xstr += '#define {}{}_H_\n\n'.format(tf_prepend, varname.upper())
+  xstr += '#include <cstdint>\n\n'
+  xstr += 'extern const int g_{}_size;\n'.format(varname)
+  xstr += 'extern const int16_t g_{}[];\n\n'.format(varname)
+  xstr += '#endif'
+
+  with open(directory + varname + '.h', 'w') as f:
+    f.write(xstr)
+
+
+# x = sf.read('yes_f2e59fea_nohash_1.wav')[0]
+# to_cc(x, 'yes_waveform')
+# to_h(x, 'yes_waveform')
+#
+# x = sf.read('no_f9643d42_nohash_4.wav')[0]
+# to_cc(x, 'no_waveform')
+# to_h(x, 'no_waveform')
+
+# 30ms of data @ 16 kHz = 480 samples
+hann = np.hanning(int(16000 * 0.03))  # Window 30ms of data
+to_cc(hann, 'hanning', directory='./')
+to_h(hann, 'hanning', directory='./')
+
+t = np.arange(16000. * 0.03) / 16000.
+sin1k = np.sin(
+    2 * np.pi * 1000 *
+    t)  # Factor of 10 because micro preprocessing overflows otherwise
+to_cc(sin1k, 'sin_1k', directory='./', scale_factor=0.1)
+to_h(sin1k, 'sin_1k', directory='./')
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e6a11ce52c6b41a9f6fcbfc5a31bf7e0da8361cf
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.cc
@@ -0,0 +1,63 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h"
+
+const int g_hanning_size = 480;
+const int16_t g_hanning[480] = {
+    0,     1,     5,     12,    22,    35,    50,    69,    90,    114,   140,
+    170,   202,   237,   275,   316,   359,   405,   454,   506,   560,   617,
+    677,   740,   805,   873,   943,   1016,  1092,  1171,  1252,  1336,  1422,
+    1511,  1602,  1696,  1793,  1892,  1993,  2097,  2204,  2312,  2424,  2537,
+    2653,  2772,  2893,  3016,  3141,  3269,  3399,  3531,  3665,  3802,  3941,
+    4082,  4225,  4370,  4517,  4666,  4817,  4971,  5126,  5283,  5442,  5603,
+    5765,  5930,  6096,  6265,  6435,  6606,  6779,  6954,  7131,  7309,  7489,
+    7670,  7853,  8037,  8223,  8410,  8598,  8788,  8979,  9171,  9365,  9560,
+    9756,  9953,  10151, 10350, 10551, 10752, 10954, 11157, 11362, 11567, 11772,
+    11979, 12186, 12395, 12603, 12813, 13023, 13233, 13445, 13656, 13868, 14081,
+    14294, 14507, 14721, 14935, 15149, 15363, 15578, 15793, 16008, 16222, 16437,
+    16652, 16867, 17082, 17297, 17511, 17725, 17939, 18153, 18367, 18580, 18793,
+    19005, 19217, 19428, 19639, 19850, 20059, 20269, 20477, 20685, 20892, 21098,
+    21303, 21508, 21712, 21914, 22116, 22317, 22517, 22716, 22913, 23110, 23305,
+    23499, 23692, 23884, 24075, 24264, 24451, 24638, 24823, 25006, 25188, 25369,
+    25548, 25725, 25901, 26075, 26247, 26418, 26587, 26754, 26920, 27083, 27245,
+    27405, 27563, 27719, 27874, 28026, 28176, 28324, 28470, 28614, 28756, 28896,
+    29034, 29169, 29303, 29434, 29563, 29689, 29813, 29935, 30055, 30172, 30287,
+    30400, 30510, 30617, 30723, 30825, 30926, 31023, 31119, 31211, 31301, 31389,
+    31474, 31556, 31636, 31713, 31788, 31860, 31929, 31996, 32059, 32121, 32179,
+    32235, 32288, 32338, 32386, 32430, 32472, 32512, 32548, 32582, 32613, 32641,
+    32666, 32689, 32708, 32725, 32739, 32751, 32759, 32765, 32767, 32767, 32765,
+    32759, 32751, 32739, 32725, 32708, 32689, 32666, 32641, 32613, 32582, 32548,
+    32512, 32472, 32430, 32386, 32338, 32288, 32235, 32179, 32121, 32059, 31996,
+    31929, 31860, 31788, 31713, 31636, 31556, 31474, 31389, 31301, 31211, 31119,
+    31023, 30926, 30825, 30723, 30617, 30510, 30400, 30287, 30172, 30055, 29935,
+    29813, 29689, 29563, 29434, 29303, 29169, 29034, 28896, 28756, 28614, 28470,
+    28324, 28176, 28026, 27874, 27719, 27563, 27405, 27245, 27083, 26920, 26754,
+    26587, 26418, 26247, 26075, 25901, 25725, 25548, 25369, 25188, 25006, 24823,
+    24638, 24451, 24264, 24075, 23884, 23692, 23499, 23305, 23110, 22913, 22716,
+    22517, 22317, 22116, 21914, 21712, 21508, 21303, 21098, 20892, 20685, 20477,
+    20269, 20059, 19850, 19639, 19428, 19217, 19005, 18793, 18580, 18367, 18153,
+    17939, 17725, 17511, 17297, 17082, 16867, 16652, 16437, 16222, 16008, 15793,
+    15578, 15363, 15149, 14935, 14721, 14507, 14294, 14081, 13868, 13656, 13445,
+    13233, 13023, 12813, 12603, 12395, 12186, 11979, 11772, 11567, 11362, 11157,
+    10954, 10752, 10551, 10350, 10151, 9953,  9756,  9560,  9365,  9171,  8979,
+    8788,  8598,  8410,  8223,  8037,  7853,  7670,  7489,  7309,  7131,  6954,
+    6779,  6606,  6435,  6265,  6096,  5930,  5765,  5603,  5442,  5283,  5126,
+    4971,  4817,  4666,  4517,  4370,  4225,  4082,  3941,  3802,  3665,  3531,
+    3399,  3269,  3141,  3016,  2893,  2772,  2653,  2537,  2424,  2312,  2204,
+    2097,  1993,  1892,  1793,  1696,  1602,  1511,  1422,  1336,  1252,  1171,
+    1092,  1016,  943,   873,   805,   740,   677,   617,   560,   506,   454,
+    405,   359,   316,   275,   237,   202,   170,   140,   114,   90,    69,
+    50,    35,    22,    12,    5,     1,     0};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/timer.h b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h
similarity index 54%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/timer.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h
index 162952844a832ebd0b0273d13a929fec6fa22892..e7d9c5c85866988469f96a444c503863bc2bef4c 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/timer.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h
@@ -13,19 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TIMER_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TIMER_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_HANNING_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_HANNING_H_
 
 #include <cstdint>
 
-// Returns the time in milliseconds. There's no contract about what time zero
-// represents, the accuracy, or the granularity of the result. Subsequent calls
-// will generally not return a lower value, but even that's not guaranteed if
-// there's an overflow  wraparound.
-// The reference implementation of this function just returns a constantly
-// incrementing value for each call, since it would need a non-portable platform
-// call to access time information. For real applications, you'll need to write
-// your own platform-specific implementation.
-int32_t TimeInMilliseconds();
+extern const int g_hanning_size;
+extern const int16_t g_hanning[];
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TIMER_H_
+#endif
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/preprocessor.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/preprocessor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5c6978b5edef635af58873bf537a251fa4510ef4
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/preprocessor.cc
@@ -0,0 +1,105 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+extern "C" {
+#define ARM_MATH_CM4
+#define IFFT_FLAG_R 0
+#define BIT_REVERSE_FLAG 1
+#define FFT_SIZE 512
+#define FFT_SIZE_DIV2 256
+#include <arm_math.h>
+#include "arm_cmplx_mag_squared_q10p6.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h"
+}
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+
+void quantize(q15_t* bufA, q15_t* bufB, uint8_t* output);
+
+q15_t bufA[FFT_SIZE];
+q15_t bufB[FFT_SIZE];
+arm_rfft_instance_q15 S_arm_fft;
+arm_status arm_math_status;
+
+namespace {
+// These constants allow us to allocate fixed-sized arrays on the stack for our
+// working memory.
+constexpr int kInputSize = 512;
+constexpr int kAverageWindowSize = 6;
+constexpr int kOutputSize =
+    ((kInputSize / 2) + (kAverageWindowSize - 1)) / kAverageWindowSize;
+}  // namespace
+
+TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
+                        const int16_t* input, int input_size, int output_size,
+                        uint8_t* output) {
+  if (input_size > kInputSize) {
+    error_reporter->Report("Input size %d larger than %d", input_size,
+                           kInputSize);
+    return kTfLiteError;
+  }
+  if (output_size != kOutputSize) {
+    error_reporter->Report("Requested output size %d doesn't match %d",
+                           output_size, kOutputSize);
+    return kTfLiteError;
+  }
+
+  // 30ms at 16 kHz = 480 samples
+  // We want to pad the rest of the 512-sample buffer with zeros
+  arm_mult_q15((q15_t*)input, g_hanning, bufB, 480);
+  int i;
+  for (i = 480; i < 512; i++) {
+    bufB[i] = 0;
+  }
+
+  // Should move init code outside of Preprocess() function
+  arm_math_status =
+      arm_rfft_init_q15(&S_arm_fft, FFT_SIZE, IFFT_FLAG_R, BIT_REVERSE_FLAG);
+  arm_rfft_q15(&S_arm_fft, bufB, bufA);
+
+  // The rfft function packs data as follows:
+  // {real[0], real[N/2], real[1], imag[1], ..., real[N/2-1], imag[N/2-1]}
+  // Below we pack as follows:
+  // {real[0], 0, real[1], imag[1], ..., real[N/2-1], imag[N/2-1, real[N/2], 0}
+  bufA[FFT_SIZE_DIV2] = bufA[1];
+  bufA[FFT_SIZE_DIV2 + 1] = 0;
+  bufA[1] = 0;
+  arm_cmplx_mag_squared_q10p6(bufA, bufB, FFT_SIZE_DIV2 + 1);
+
+  quantize(bufA, bufB, output);
+
+  return kTfLiteOk;
+}
+
+void quantize(q15_t* bufA, q15_t* bufB, uint8_t* output) {
+  int i;
+  for (i = 0; i < 42; i++) {
+    arm_mean_q15(bufB + 6 * i, 6, bufA + i);
+  }
+  arm_mean_q15(bufB + 252, 5, bufA + 42);
+
+  for (i = 0; i < 43; i++) {
+    output[i] = (uint8_t)(bufA[i] >> 5);
+  }
+}
+
+TfLiteStatus Preprocess_1sec(tflite::ErrorReporter* error_reporter,
+                             const int16_t* input, uint8_t* output) {
+  int i;
+  for (i = 0; i < 49; i++) {
+    Preprocess(error_reporter, input + i * 320, 480, 43, output + i * 43);
+  }
+  return kTfLiteOk;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.cc
new file mode 100644
index 0000000000000000000000000000000000000000..45e9f798ef04cf40268cf379f24ecbfa904be9b5
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.cc
@@ -0,0 +1,63 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.h"
+
+const int g_sin_1k_size = 480;
+const int16_t g_sin_1k[480] = {
+    0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317,
+    -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,
+    2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,
+    1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027,
+    -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,
+    1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,
+    2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276,
+    -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,
+    0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,
+    3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027,
+    -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,
+    -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,
+    3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317,
+    -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253,
+    -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,
+    3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253,
+    0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317,
+    -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,
+    2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,
+    1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027,
+    -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,
+    1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,
+    2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276,
+    -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,
+    0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,
+    3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027,
+    -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,
+    -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,
+    3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317,
+    -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253,
+    -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,
+    3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253,
+    0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317,
+    -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,
+    2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,
+    1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027,
+    -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,
+    1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,
+    2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276,
+    -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,
+    0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,
+    3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027,
+    -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,
+    -1253, -2317, -3027, -3276, -3027, -2317, -1253};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.h b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.h
new file mode 100644
index 0000000000000000000000000000000000000000..653a6f583013dc03d0601cfd97a85b15db2c6677
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.h
@@ -0,0 +1,24 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIN_1K_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIN_1K_H_
+
+#include <cstdint>
+
+extern const int g_sin_1k_size;
+extern const int16_t g_sin_1k[];
+
+#endif
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc
index 0e42329cade2e4b49b8000412c593f9a442af4ca..49aace3d7d05ba1d7010d3d834c66dc13e488c96 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc
@@ -1,153 +1,106 @@
 
-# Tests loading and running a speech model.
 MICRO_SPEECH_TEST_SRCS := \
 tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc
-ALL_SRCS += $(MICRO_SPEECH_TEST_SRCS)
-MICRO_SPEECH_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICRO_SPEECH_TEST_SRCS))))
-MICRO_SPEECH_TEST_BINARY := $(BINDIR)micro_speech_test
-ALL_BINARIES += $(MICRO_SPEECH_TEST_BINARY)
-$(MICRO_SPEECH_TEST_BINARY): $(MICRO_SPEECH_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(MICRO_SPEECH_TEST_BINARY) $(MICRO_SPEECH_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-micro_speech_test: $(MICRO_SPEECH_TEST_BINARY)
-micro_speech_test_bin: $(MICRO_SPEECH_TEST_BINARY).bin
-test_micro_speech: $(MICRO_SPEECH_TEST_BINARY)
-	$(TEST_SCRIPT) $(MICRO_SPEECH_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
-
-# Source files that are used by multiple preprocessor tests.
-PREPROCESSOR_TEST_SHARED_SRCS := \
+
+MICRO_SPEECH_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h \
+
+PREPROCESSOR_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc
 
-# Test the float reference code for feature generation.
-PREPROCESSOR_REFERENCE_TEST_SRCS = \
-$(PREPROCESSOR_TEST_SHARED_SRCS) \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
-ALL_SRCS += $(PREPROCESSOR_REFERENCE_TEST_SRCS)
-PREPROCESSOR_REFERENCE_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_REFERENCE_TEST_SRCS))))
-PREPROCESSOR_REFERENCE_TEST_BINARY := $(BINDIR)preprocessor_reference_test
-ALL_BINARIES += $(PREPROCESSOR_REFERENCE_TEST_BINARY)
-$(PREPROCESSOR_REFERENCE_TEST_BINARY): $(PREPROCESSOR_REFERENCE_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(PREPROCESSOR_REFERENCE_TEST_BINARY) $(PREPROCESSOR_REFERENCE_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-preprocessor_reference_test: $(PREPROCESSOR_REFERENCE_TEST_BINARY)
-preprocessor_reference_test_bin: $(PREPROCESSOR_REFERENCE_TEST_BINARY).bin
-test_preprocessor_reference: $(PREPROCESSOR_REFERENCE_TEST_BINARY)
-	$(TEST_SCRIPT) $(PREPROCESSOR_REFERENCE_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
-
-# Test the fixed point reference code for feature generation.
-PREPROCESSOR_FIXED_TEST_SRCS = \
-$(PREPROCESSOR_TEST_SHARED_SRCS) \
-tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
-ALL_SRCS += $(PREPROCESSOR_FIXED_TEST_SRCS)
-PREPROCESSOR_FIXED_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_FIXED_TEST_SRCS))))
-PREPROCESSOR_FIXED_TEST_BINARY := $(BINDIR)preprocessor_fixed_test
-ALL_BINARIES += $(PREPROCESSOR_FIXED_TEST_BINARY)
-$(PREPROCESSOR_FIXED_TEST_BINARY): $(PREPROCESSOR_FIXED_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(PREPROCESSOR_FIXED_TEST_BINARY) $(PREPROCESSOR_FIXED_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-preprocessor_fixed_test: $(PREPROCESSOR_FIXED_TEST_BINARY)
-preprocessor_fixed_test_bin: $(PREPROCESSOR_FIXED_TEST_BINARY).bin
-test_preprocessor_fixed: $(PREPROCESSOR_FIXED_TEST_BINARY)
-	$(TEST_SCRIPT) $(PREPROCESSOR_FIXED_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+PREPROCESSOR_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h
 
-# Tests the audio provider module.
 AUDIO_PROVIDER_TEST_SRCS := \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
-ALL_SRCS += $(AUDIO_PROVIDER_TEST_SRCS)
-AUDIO_PROVIDER_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(AUDIO_PROVIDER_TEST_SRCS))))
-AUDIO_PROVIDER_TEST_BINARY := $(BINDIR)audio_provider_test
-ALL_BINARIES += $(AUDIO_PROVIDER_TEST_BINARY)
-$(AUDIO_PROVIDER_TEST_BINARY): $(AUDIO_PROVIDER_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(AUDIO_PROVIDER_TEST_BINARY) $(AUDIO_PROVIDER_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-audio_provider_test: $(AUDIO_PROVIDER_TEST_BINARY)
-audio_provider_test_bin: $(AUDIO_PROVIDER_TEST_BINARY).bin
-test_audio_provider: $(AUDIO_PROVIDER_TEST_BINARY)
-	$(TEST_SCRIPT) $(AUDIO_PROVIDER_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
 
-# Tests the feature provider module.
+AUDIO_PROVIDER_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
+
 FEATURE_PROVIDER_TEST_SRCS := \
 tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
-ALL_SRCS += $(FEATURE_PROVIDER_TEST_SRCS)
-FEATURE_PROVIDER_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(FEATURE_PROVIDER_TEST_SRCS))))
-FEATURE_PROVIDER_TEST_BINARY := $(BINDIR)feature_provider_test
-ALL_BINARIES += $(FEATURE_PROVIDER_TEST_BINARY)
-$(FEATURE_PROVIDER_TEST_BINARY): $(FEATURE_PROVIDER_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(FEATURE_PROVIDER_TEST_BINARY) $(FEATURE_PROVIDER_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-feature_provider_test: $(FEATURE_PROVIDER_TEST_BINARY)
-feature_provider_test_bin: $(FEATURE_PROVIDER_TEST_BINARY).bin
-test_feature_provider: $(FEATURE_PROVIDER_TEST_BINARY)
-	$(TEST_SCRIPT) $(FEATURE_PROVIDER_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
-
-# Tests the timer module.
-TIMER_TEST_SRCS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/timer_test.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc
-ALL_SRCS += $(TIMER_TEST_SRCS)
-TIMER_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(TIMER_TEST_SRCS))))
-TIMER_TEST_BINARY := $(BINDIR)timer_test
-ALL_BINARIES += $(TIMER_TEST_BINARY)
-$(TIMER_TEST_BINARY): $(TIMER_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(TIMER_TEST_BINARY) $(TIMER_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-timer_test: $(TIMER_TEST_BINARY)
-timer_test_bin: $(TIMER_TEST_BINARY).bin
-test_timer: $(TIMER_TEST_BINARY)
-	$(TEST_SCRIPT) $(TIMER_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
 
-# Builds a standalone speech command recognizer binary.
+FEATURE_PROVIDER_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h
+
+RECOGNIZE_COMMANDS_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
+
+RECOGNIZE_COMMANDS_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
+
 MICRO_SPEECH_SRCS := \
 tensorflow/lite/experimental/micro/examples/micro_speech/main.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc
-ALL_SRCS += $(MICRO_SPEECH_SRCS)
-MICRO_SPEECH_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICRO_SPEECH_SRCS))))
-MICRO_SPEECH_BINARY := $(BINDIR)micro_speech
-ALL_BINARIES += $(MICRO_SPEECH_BINARY)
-$(MICRO_SPEECH_BINARY): $(MICRO_SPEECH_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(MICRO_SPEECH_BINARY) $(MICRO_SPEECH_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-micro_speech: $(MICRO_SPEECH_BINARY)
-micro_speech_bin: $(MICRO_SPEECH_BINARY).bin
+tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
+
+MICRO_SPEECH_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
+
+# Find any platform-specific rules for this example.
+include $(wildcard tensorflow/lite/experimental/micro/examples/micro_speech/*/Makefile.inc)
+
+# Tests loading and running a speech model.
+$(eval $(call microlite_test,micro_speech_test,\
+$(MICRO_SPEECH_TEST_SRCS),$(MICRO_SPEECH_TEST_HDRS)))
+
+# Test the code for feature generation.
+$(eval $(call microlite_test,preprocessor_test,\
+$(PREPROCESSOR_TEST_SRCS), $(PREPROCESSOR_TEST_HDRS)))
+
+# Tests the audio provider module.
+$(eval $(call microlite_test,audio_provider_test,\
+$(AUDIO_PROVIDER_TEST_SRCS),$(AUDIO_PROVIDER_TEST_HDRS)))
+
+# Tests the feature provider module.
+$(eval $(call microlite_test,feature_provider_test,\
+$(FEATURE_PROVIDER_TEST_SRCS),$(FEATURE_PROVIDER_TEST_HDRS)))
+
+# Tests the feature provider module.
+$(eval $(call microlite_test,recognize_commands_test,\
+$(RECOGNIZE_COMMANDS_TEST_SRCS),$(RECOGNIZE_COMMANDS_TEST_HDRS)))
+
+# Builds a standalone speech command recognizer binary.
+$(eval $(call microlite_test,micro_speech,\
+$(MICRO_SPEECH_SRCS),$(MICRO_SPEECH_HDRS)))
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/.gitignore b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..cb8d4d02c418e5d8c903c69729e8e1b3ee44a8bf
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/.gitignore
@@ -0,0 +1,4 @@
+captured_data.txt
+captured_data.wav
+cmsis_*.txt
+micro_*.txt
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/Makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..0aa362be0038f8757387a6311021e183dc19dabd
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/Makefile.inc
@@ -0,0 +1,100 @@
+# Settings for apollo3 evb platforms.
+ifeq ($(TARGET), apollo3evb)
+
+  PUSHBUTTON_MICRO_SPEECH_TEST_SRCS := \
+    $(AP3_MICRO_DIR)/../preprocessor.cc \
+    $(AP3_MICRO_DIR)/pushbutton_main.c \
+    $(AP3_MICRO_DIR)/pushbutton_test.cc \
+    $(AP3_MICRO_DIR)/../tiny_conv_model_data.cc \
+    $(APOLLO3_SDK)/devices/am_devices_led.c
+  ALL_SRCS += $(PUSHBUTTON_MICRO_SPEECH_TEST_SRCS)
+  PUSHBUTTON_MICRO_SPEECH_TEST_OBJS := $(addprefix $(OBJDIR), \
+    $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PUSHBUTTON_MICRO_SPEECH_TEST_SRCS))))
+  PUSHBUTTON_MICRO_SPEECH_TEST_BINARY := $(BINDIR)pushbutton_micro_speech_test
+  $(PUSHBUTTON_MICRO_SPEECH_TEST_BINARY): $(PUSHBUTTON_MICRO_SPEECH_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(PUSHBUTTON_MICRO_SPEECH_TEST_BINARY) $(PUSHBUTTON_MICRO_SPEECH_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+  pushbutton_micro_speech_test: $(PUSHBUTTON_MICRO_SPEECH_TEST_BINARY)
+  pushbutton_micro_speech_test_bin: $(PUSHBUTTON_MICRO_SPEECH_TEST_BINARY).bin
+  test_pushbutton_micro_speech: $(PUSHBUTTON_MICRO_SPEECH_TEST_BINARY)
+	$(TEST_SCRIPT) $(PUSHBUTTON_MICRO_SPEECH_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+  PUSHBUTTON_CMSIS_SPEECH_TEST_SRCS := \
+    $(AP3_MICRO_DIR)/pushbutton_main.c \
+    $(AP3_MICRO_DIR)/pushbutton_test.cc \
+    $(AP3_MICRO_DIR)/../tiny_conv_model_data.cc \
+    $(CMSIS_DIR)/preprocessor.cc \
+    $(CMSIS_EXT_DIR)/arm_cmplx_mag_squared_q10p6.c \
+    $(CMSIS_DIR)/hanning.c \
+    $(APOLLO3_SDK)/devices/am_devices_led.c \
+    $(CMSIS_SRCS)
+  ALL_SRCS += $(PUSHBUTTON_CMSIS_SPEECH_TEST_SRCS)
+  PUSHBUTTON_CMSIS_SPEECH_TEST_OBJS := $(addprefix $(OBJDIR), \
+    $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PUSHBUTTON_CMSIS_SPEECH_TEST_SRCS))) \
+    arm_bitreversal2.o)
+  PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY := $(BINDIR)pushbutton_cmsis_speech_test
+  $(PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY): $(PUSHBUTTON_CMSIS_SPEECH_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY) $(PUSHBUTTON_CMSIS_SPEECH_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+  pushbutton_cmsis_speech_test: $(PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY)
+  pushbutton_cmsis_speech_test_bin: $(PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY).bin
+  test_pushbutton_cmsis_speech: $(PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY)
+	$(TEST_SCRIPT) $(PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+  PREPROCESSOR_1K_SRCS := \
+    tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k.cc \
+    tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.cc
+
+  PREPROCESSOR_1K_MICRO_TEST_SRCS := \
+    $(PREPROCESSOR_1K_SRCS) \
+    $(AP3_MICRO_DIR)/../fixed_point/preprocessor.cc \
+    $(AP3_EXT_MICRO_DIR)/system_apollo3.c \
+    $(AP3_MICRO_DIR)/_main.c
+  ALL_SRCS += $(PREPROCESSOR_1K_MICRO_TEST_SRCS)
+  PREPROCESSOR_1K_MICRO_TEST_OBJS := $(addprefix $(OBJDIR), \
+    $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_1K_MICRO_TEST_SRCS))))
+  PREPROCESSOR_1K_MICRO_TEST_BINARY := $(BINDIR)preprocessor_1k_micro_test
+  $(PREPROCESSOR_1K_MICRO_TEST_BINARY): $(PREPROCESSOR_1K_MICRO_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(PREPROCESSOR_1K_MICRO_TEST_BINARY) $(PREPROCESSOR_1K_MICRO_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+  preprocessor_1k_micro_test: $(PREPROCESSOR_1K_MICRO_TEST_BINARY)
+  preprocessor_1k_micro_test_bin: $(PREPROCESSOR_1K_MICRO_TEST_BINARY).bin
+  test_preprocessor_1k_micro: $(PREPROCESSOR_1K_MICRO_TEST_BINARY)
+	$(TEST_SCRIPT) $(PREPROCESSOR_1K_MICRO_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+  PREPROCESSOR_1K_CMSIS_TEST_SRCS := \
+    $(PREPROCESSOR_1K_SRCS) \
+    $(CMSIS_DIR)/preprocessor.cc \
+    $(CMSIS_EXT_DIR)/arm_cmplx_mag_squared_q10p6.c \
+    $(CMSIS_DIR)/hanning.c \
+    $(AP3_EXT_MICRO_DIR)/system_apollo3.c \
+    $(AP3_MICRO_DIR)/_main.c \
+    $(CMSIS_SRCS)
+  ALL_SRCS += $(PREPROCESSOR_1K_CMSIS_TEST_SRCS)
+  PREPROCESSOR_1K_CMSIS_TEST_BINARY := $(BINDIR)preprocessor_1k_cmsis_test
+  PREPROCESSOR_1K_CMSIS_TEST_OBJS := $(addprefix $(OBJDIR), \
+    $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_1K_CMSIS_TEST_SRCS)))\
+    arm_bitreversal2.o)
+  $(PREPROCESSOR_1K_CMSIS_TEST_BINARY): $(PREPROCESSOR_1K_CMSIS_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(PREPROCESSOR_1K_CMSIS_TEST_BINARY) $(PREPROCESSOR_1K_CMSIS_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+  preprocessor_1k_cmsis_test: $(PREPROCESSOR_1K_CMSIS_TEST_BINARY)
+  preprocessor_1k_cmsis_test_bin: $(PREPROCESSOR_1K_CMSIS_TEST_BINARY).bin
+  test_preprocessor_1k_cmsis: $(PREPROCESSOR_1K_CMSIS_TEST_BINARY)
+	$(TEST_SCRIPT) $(PREPROCESSOR_1K_CMSIS_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+  PREPROCESSOR_TEST_SRCS += \
+    $(AP3_MICRO_DIR)/_main.c 
+
+  $(OBJDIR)arm_bitreversal2.o:
+	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $(CMSIS_SRC_DIR)/TransformFunctions/arm_bitreversal2.S -o $(OBJDIR)arm_bitreversal2.o
+
+endif
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/README.md b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..10be9f136a9088d1ad098d685791ae357e8a9c22
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/README.md
@@ -0,0 +1,129 @@
+# Description of Apollo3 Makefile targets
+
+*   **pushbutton_cmsis_speech_test_bin**:
+    *   When users press BTN2 on the Apollo3 EVK, 1 second of audio is captured.
+    *   Then the audio is sent to the CMSIS version of the preprocessor and into
+        the neural net
+    *   To print out the neural net's inference scores, run GDB and source
+        pushbutton\_cmsis\_scores.cmd
+    *   To save the captured audio to a text file (captured\_data.txt), run GDB
+        and source pushbutton\_cmsis\_voice.cmd
+    *   Setup python
+        *   sudo apt install python-pip
+        *   sudo apt install python-tk
+        *   pip install numpy
+        *   pip install matplotlib
+        *   pip install pysoundfile
+        *   python captured_data_to_wav.py
+    *   captured\_data.txt can be turned into a \*.wav file using
+        captured\_data\_to\_wav.py by executing "python
+        captured\_data\_to\_wav.py"
+*   **preprocessor_1k_cmsis_test_bin**:
+    *   Sends a 1 kHz sine wave to the CMSIS fixed\_point version of the
+        preprocessor
+    *   **This test should be compiled with the -O0 option.** Otherwise, the
+        breakpoints will not be reached
+        *   In
+            tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc
+            change "-O3" to "-O0" on line 47
+        *   **DO NOT FORGET TO REVERT CHANGE AFTER EXPERIMENT**
+        *   In future, enhance scripts to handle automatically, NOT manually!
+    *   Clean project by running "make -f
+        tensorflow/lite/experimental/micro/tools/make/Makefile clean"
+    *   Compile BIN by running "make -f
+        tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=apollo3evb
+        preprocessor_1k_cmsis_test_bin"
+    *   Run with the preprocessor\_1k\_cmsis\_test.cmd GDB command file
+    *   Produces four text files corresponding to outputs from the CMSIS
+        fixed\_point version of this algorithm:
+        *   cmsis_windowed_input.txt: the sinusoid after multiplying elementwise
+            with a Hann window
+        *   cmsis_dft.txt: the DFT of the windowed sinusoid
+        *   cmsis_power.txt: the magnitude squared of the DFT
+        *   cmsis_power_avg.txt: the 6-bin average of the magnitude squared of
+            the DFT
+    *   Run both verisons of the 1KHz pre-processor test and then compare.
+        *   These files can be plotted with "python compare\_1k.py"
+    *   Also prints out the number of cycles the code took to execute (using the
+        DWT->CYCCNT register)
+*   **preprocessor_1k_micro_test_bin**
+    *   Sends a 1 kHz sine wave to the Micro-Lite fixed\_point version of the
+        preprocessor
+    *   **This test should be compiled with the -O0 option.** Otherwise, the
+        breakpoints will not be reached
+    *   Run with the preprocessor\_1k\_micro\_test.cmd GDB command file
+    *   Produces four text files corresponding to outputs from the Micro-Lite
+        version of this algorithm:
+        *   micro_windowed_input.txt: the sinusoid after multiplying elementwise
+            with a Hann window
+        *   micro_dft.txt: the DFT of the windowed sinusoid
+        *   micro_power.txt: the magnitude squared of the DFT
+        *   micro_power_avg.txt: the 6-bin average of the magnitude squared of
+            the DFT
+    *   Run both verisons of the 1KHz pre-processor test and then compare.
+        *   These files can be plotted with "python compare\_1k.py"
+    *   Also prints out the number of cycles the code took to execute (using the
+        DWT->CYCCNT register)
+
+# Description of files
+
+*   **.gitignore**: Git should ignore \*.txt and \*.wav files that result from
+    experiments run in this directory
+*   **captured\_data\_to\_wav.py**: Python script that parses a text file
+    containing data dumped from GDB (specifically the verilog format) and
+    creates a \*.wav file using
+    [PySoundFile](https://pysoundfile.readthedocs.io/en/0.9.0/).
+*   **compare\_1k.py**: This script compares the intermediate variables and
+    final outputs of the micro-lite fixed-point preprocessor function and the
+    CMSIS version of this function. The stimulus provided to each preprocessor
+    is the same: a 1 kHz sinusoid.
+*   **get\_yesno\_data.cmd**: A GDB command file that runs preprocessor_test
+    (where TARGET=apollo3evb) and dumps the calculated data for the "yes" and
+    "no" input wavfeorms to text files
+*   **\_main.c**: Point of entry for the micro_speech test
+*   **preprocessor_1k.cc**: A version of preprocessor.cc where a 1 kHz sinusoid
+    is provided as input to the preprocessor
+*   **preprocessor_1k_cmsis_test.cmd**: GDB command file for the CMSIS
+    preprocessor 1 kHz test
+*   **preprocessor_1k_micro_test.cmd**: GDB command file for the Micro-Lite
+    preprocessor 1 kHz test
+*   **preprocessor_test.cmd**: GDB command file for the preprocessor test
+*   **pushbutton_cmsis_scores.cmd**: GDB command file that runs
+    pushbutton_cmsis_speech_test_bin. It adds a breakpoint immediately after the
+    scores are reported and prints out each score. Then it continues code
+    execution.
+*   **pushbutton_cmsis_voice.cmd**: GDB command file that runs
+    pushbutton_cmsis_speech_test_bin. Dumps the recorded 1 second of audio to
+    captured_data.txt, which can then be processed by the python file
+    captured_data_to_wav.py.
+*   **pushbutton_main.c**: Source file containing program point of entry
+    \_main() for the pushbutton\_\* tests. Contains Interrupt Service Routines
+    for PDM data capture and pushbuttons. Calls the main() function of
+    pushbutton_test.cc
+*   **pushbutton_test.cc**: Source file containing main() function for the
+    pushbutton\_\* tests. main() calls the preprocessor function and the neural
+    net inference function.
+
+# Description of externally downloaded files in ../apollo3_ext
+
+*   **apollo3.h**: Apollo 3 version of the
+    [CMSIS Device Header File (device.h)](https://www.keil.com/pack/doc/CMSIS/Core/html/device_h_pg.html).
+    Available in the
+    [Ambiq Keil Pack](http://s3.ambiqmicro.com/pack/AmbiqMicro.Apollo_DFP.1.1.0.pack).
+*   **system_apollo3.c**: Apollo 3 version of the
+    [CMSIS System Configuration File system\_\<device\>.c](https://www.keil.com/pack/doc/CMSIS/Core/html/system_c_pg.html).
+    Available in the
+    [Ambiq Keil Pack](http://s3.ambiqmicro.com/pack/AmbiqMicro.Apollo_DFP.1.1.0.pack).
+*   **system_apollo3.h**: Apollo 3 version of the
+    [CMSIS System Configuration File system\_\<device\>.h](https://www.keil.com/pack/doc/CMSIS/Core/html/system_c_pg.html).
+    Available in the
+    [Ambiq Keil Pack](http://s3.ambiqmicro.com/pack/AmbiqMicro.Apollo_DFP.1.1.0.pack).
+
+# FFT scaling
+
+See https://github.com/ARM-software/CMSIS_5/issues/220
+
+> And as @xizhizhang pointed, I think there may be an error on the internal
+> downscaling, or at least on the documentation. It looks like during the fft
+> computation, the downscaling factor reach 2**-9 for a 512 rfft operation,
+> being the output in Q10.22, instead the documented 2**-8 and Q9.23.
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/_main.c b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/_main.c
new file mode 100644
index 0000000000000000000000000000000000000000..b49d5c50ffc936fd34115cc9150829b47a1e3ab5
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/_main.c
@@ -0,0 +1,117 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stdint.h>
+#include "am_bsp.h"
+#include "am_mcu_apollo.h"  // Defines AM_CMSIS_REGS
+#include "am_util.h"
+
+//*****************************************************************************
+//
+// The entry point for the application.
+//
+//*****************************************************************************
+extern int main(int argc, char** argv);
+
+void DebugLog(const char* s) { am_util_stdio_printf("%s", s); }
+void DebugLogInt32(int32_t i) { am_util_stdio_printf("%d", i); }
+void DebugLogUInt32(uint32_t i) { am_util_stdio_printf("%d", i); }
+void DebugLogHex(uint32_t i) { am_util_stdio_printf("0x%8x", i); }
+void DebugLogFloat(float i) { am_util_stdio_printf("%f", i); }
+
+int _main(void) {
+  am_util_id_t sIdDevice;
+  uint32_t ui32StrBuf;
+
+  //
+  // Set the clock frequency.
+  //
+  am_hal_clkgen_control(AM_HAL_CLKGEN_CONTROL_SYSCLK_MAX, 0);
+
+  //
+  // Set the default cache configuration
+  //
+  am_hal_cachectrl_config(&am_hal_cachectrl_defaults);
+  am_hal_cachectrl_enable();
+
+  //
+  // Configure the board for low power operation.
+  //
+  am_bsp_low_power_init();
+
+  //
+  // Initialize the printf interface for UART output
+  //
+  am_bsp_uart_printf_enable();
+
+  //
+  // Print the banner.
+  //
+  am_util_stdio_terminal_clear();
+  am_util_stdio_printf("Hello World!\n\n");
+
+  //
+  // Print the device info.
+  //
+  am_util_id_device(&sIdDevice);
+  am_util_stdio_printf("Vendor Name: %s\n", sIdDevice.pui8VendorName);
+  am_util_stdio_printf("Device type: %s\n", sIdDevice.pui8DeviceName);
+
+  am_util_stdio_printf("Qualified: %s\n",
+                       sIdDevice.sMcuCtrlDevice.ui32Qualified ? "Yes" : "No");
+
+  am_util_stdio_printf(
+      "Device Info:\n"
+      "\tPart number: 0x%08X\n"
+      "\tChip ID0:    0x%08X\n"
+      "\tChip ID1:    0x%08X\n"
+      "\tRevision:    0x%08X (Rev%c%c)\n",
+      sIdDevice.sMcuCtrlDevice.ui32ChipPN, sIdDevice.sMcuCtrlDevice.ui32ChipID0,
+      sIdDevice.sMcuCtrlDevice.ui32ChipID1,
+      sIdDevice.sMcuCtrlDevice.ui32ChipRev, sIdDevice.ui8ChipRevMaj,
+      sIdDevice.ui8ChipRevMin);
+
+  //
+  // If not a multiple of 1024 bytes, append a plus sign to the KB.
+  //
+  ui32StrBuf = (sIdDevice.sMcuCtrlDevice.ui32FlashSize % 1024) ? '+' : 0;
+  am_util_stdio_printf(
+      "\tFlash size:  %7d (%d KB%s)\n", sIdDevice.sMcuCtrlDevice.ui32FlashSize,
+      sIdDevice.sMcuCtrlDevice.ui32FlashSize / 1024, &ui32StrBuf);
+
+  ui32StrBuf = (sIdDevice.sMcuCtrlDevice.ui32SRAMSize % 1024) ? '+' : 0;
+  am_util_stdio_printf(
+      "\tSRAM size:   %7d (%d KB%s)\n\n", sIdDevice.sMcuCtrlDevice.ui32SRAMSize,
+      sIdDevice.sMcuCtrlDevice.ui32SRAMSize / 1024, &ui32StrBuf);
+
+  //
+  // Print the compiler version.
+  //
+  am_util_stdio_printf("App Compiler:    %s\n", COMPILER_VERSION);
+#ifdef AM_PART_APOLLO3
+  am_util_stdio_printf("HAL Compiler:    %s\n", g_ui8HALcompiler);
+  am_util_stdio_printf("HAL SDK version: %d.%d.%d\n", g_ui32HALversion.s.Major,
+                       g_ui32HALversion.s.Minor, g_ui32HALversion.s.Revision);
+  am_util_stdio_printf("HAL compiled with %s-style registers\n",
+                       g_ui32HALversion.s.bAMREGS ? "AM_REG" : "CMSIS");
+
+  am_util_stdio_printf("&sIdDevice: 0x%x, &ui32StrBuf: 0x%x\n", &sIdDevice,
+                       &ui32StrBuf);
+  am_hal_security_info_t secInfo;
+  char sINFO[32];
+  uint32_t ui32Status;
+#endif  // AM_PART_APOLLO3
+  main(0, NULL);
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/captured_data_to_wav.py b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/captured_data_to_wav.py
new file mode 100644
index 0000000000000000000000000000000000000000..10a05b6dcf1bbd5c779f7ee7bdf4d01ebde76017
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/captured_data_to_wav.py
@@ -0,0 +1,46 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Converts values pulled from the microcontroller into audio files."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import struct
+# import matplotlib.pyplot as plt
+import numpy as np
+import soundfile as sf
+
+
+def new_data_to_array(fn):
+  vals = []
+  with open(fn) as f:
+    for n, line in enumerate(f):
+      if n is not 0:
+        vals.extend([int(v, 16) for v in line.split()])
+  b = ''.join(map(chr, vals))
+  y = struct.unpack('<' + 'h' * int(len(b) / 2), b)
+
+  return y
+
+
+data = 'captured_data.txt'
+values = np.array(new_data_to_array(data)).astype(float)
+
+# plt.plot(values, 'o-')
+# plt.show(block=False)
+
+wav = values / np.max(np.abs(values))
+sf.write('captured_data.wav', wav, 16000)
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/compare_1k.py b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/compare_1k.py
new file mode 100644
index 0000000000000000000000000000000000000000..52352bad94a1e5627a9ca35d07a5082b6d79e6a6
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/compare_1k.py
@@ -0,0 +1,167 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Debugging script for checking calculation values."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import struct
+import matplotlib.pyplot as plt
+import numpy as np
+
+# import soundfile as sf
+
+
+def new_data_to_array(fn, datatype='int16'):
+  """Converts file information to an in-memory array."""
+  vals = []
+  with open(fn) as f:
+    for n, line in enumerate(f):
+      if n is not 0:
+        vals.extend([int(v, 16) for v in line.split()])
+  b = ''.join(map(chr, vals))
+
+  if datatype == 'int8':
+    typestr = 'b'
+    arraylen = int(len(b))
+  elif datatype == 'int16':
+    typestr = 'h'
+    arraylen = int(len(b) // 2)
+  elif datatype == 'int32':
+    typestr = 'i'
+    arraylen = int(len(b) // 4)
+  if datatype == 'uint8':
+    typestr = 'B'
+    arraylen = int(len(b))
+  elif datatype == 'uint16':
+    typestr = 'H'
+    arraylen = int(len(b) // 2)
+  elif datatype == 'uint32':
+    typestr = 'I'
+    arraylen = int(len(b) // 4)
+
+  y = np.array(struct.unpack('<' + typestr * arraylen, b))
+
+  return y
+
+
+# x is the fixed-point input in Qm.n format
+def to_float(x, n):
+  return x.astype(float) * 2**(-n)
+
+
+micro_windowed_input = new_data_to_array(
+    'micro_windowed_input.txt', datatype='int32')
+cmsis_windowed_input = new_data_to_array(
+    'cmsis_windowed_input.txt', datatype='int16')
+
+micro_dft = new_data_to_array('micro_dft.txt', datatype='int32')
+cmsis_dft = new_data_to_array('cmsis_dft.txt', datatype='int16')
+py_dft = np.fft.rfft(to_float(cmsis_windowed_input, 15), n=512)
+py_result = np.empty((2 * py_dft.size), dtype=np.float)
+py_result[0::2] = np.real(py_dft)
+py_result[1::2] = np.imag(py_dft)
+
+micro_power = new_data_to_array('micro_power.txt', datatype='int32')
+cmsis_power = new_data_to_array('cmsis_power.txt', datatype='int16')
+py_power = np.square(np.abs(py_dft))
+
+micro_power_avg = new_data_to_array('micro_power_avg.txt', datatype='uint8')
+cmsis_power_avg = new_data_to_array('cmsis_power_avg.txt', datatype='uint8')
+
+plt.figure(1)
+plt.subplot(311)
+plt.plot(micro_windowed_input, label='Micro fixed')
+plt.legend()
+plt.subplot(312)
+plt.plot(cmsis_windowed_input, label='CMSIS fixed')
+plt.legend()
+plt.subplot(313)
+plt.plot(to_float(micro_windowed_input, 30), label='Micro to float')
+plt.plot(to_float(cmsis_windowed_input, 15), label='CMSIS to float')
+plt.legend()
+
+plt.figure(2)
+plt.subplot(311)
+plt.plot(micro_dft, label='Micro fixed')
+plt.legend()
+plt.subplot(312)
+plt.plot(cmsis_dft, label='CMSIS fixed')
+plt.legend()
+plt.subplot(313)
+plt.plot(to_float(micro_dft, 22), label='Micro to float')
+# CMSIS result has 6 fractionanl bits (not 7) due to documentation error (see
+# README.md)
+plt.plot(to_float(cmsis_dft, 6), label='CMSIS to float')
+plt.plot(py_result, label='Python result')
+plt.legend()
+
+plt.figure(3)
+plt.subplot(311)
+plt.plot(micro_power, label='Micro fixed')
+plt.legend()
+plt.subplot(312)
+plt.plot(cmsis_power[0:256], label='CMSIS fixed')
+plt.legend()
+plt.subplot(313)
+plt.plot(to_float(micro_power, 22), label='Micro to float')
+plt.plot(to_float(cmsis_power[0:256], 6), label='CMSIS to float')
+plt.plot(py_power, label='Python result')
+plt.legend()
+
+plt.figure(4)
+plt.plot(micro_power_avg, label='Micro fixed')
+plt.plot(cmsis_power_avg, label='CMSIS fixed')
+plt.legend()
+plt.show()
+
+# t = np.arange(16000.*0.03)/16000.
+# # Factor of 10 because micro preprocessing overflows otherwise
+# sin1k = 0.1*np.sin(2*np.pi*1000*t)
+#
+# plt.figure(1)
+# plt.subplot(511)
+# plt.plot(sin1k)
+# plt.title('Input sine')
+#
+# plt.subplot(512)
+# plt.plot(to_float(micro_windowed_input, 30), label='Micro-Lite')
+# plt.plot(to_float(cmsis_windowed_input, 15), label='CMSIS')
+# plt.title('Windowed sine')
+# plt.legend(loc='center right')
+#
+# plt.subplot(513)
+# plt.plot(to_float(micro_dft, 22), label='Micro-Lite')
+# plt.plot(to_float(cmsis_dft, 6), label='CMSIS')
+# plt.title('FFT')
+# plt.legend(loc='center')
+#
+# plt.subplot(514)
+# plt.plot(to_float(micro_power, 22), label='Micro-Lite')
+# plt.plot(to_float(cmsis_power[0:256], 6), label='CMSIS')
+# plt.title('|FFT|^2')
+# plt.legend(loc='center right')
+#
+# plt.subplot(515)
+# plt.plot(micro_power_avg, label='Micro-Lite')
+# plt.plot(cmsis_power_avg, label='CMSIS')
+# plt.title('Averaged |FFT|^2')
+# plt.legend(loc='center right')
+#
+# plt.tight_layout(pad=0, w_pad=0.2, h_pad=0.2)
+#
+# plt.show()
+#
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k.cc
new file mode 100644
index 0000000000000000000000000000000000000000..007772e77a53b43607be90e6b8b9243b00c79546
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k.cc
@@ -0,0 +1,53 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* This file is a modification of the Tensorflow Micro Lite file preprocessor.cc
+ */
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+extern "C" {
+#include "apollo3.h"
+#include "system_apollo3.h"
+}
+
+#define output_data_size 43
+int count;
+
+extern TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
+                               const int16_t* input, int input_size,
+                               int output_size, uint8_t* output);
+
+TF_LITE_MICRO_TESTS_BEGIN
+CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk;
+// DWT->LAR = 0xC5ACCE55;
+DWT->CYCCNT = 0;
+DWT->CTRL |= DWT_CTRL_CYCCNTENA_Msk;
+
+TF_LITE_MICRO_TEST(TestPreprocessor) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  uint8_t calculated_data[output_data_size];
+  TfLiteStatus yes_status = Preprocess(error_reporter, g_sin_1k, g_sin_1k_size,
+                                       output_data_size, calculated_data);
+  count = DWT->CYCCNT;
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, yes_status);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k_cmsis_test.cmd b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k_cmsis_test.cmd
new file mode 100644
index 0000000000000000000000000000000000000000..6988057f37fc8ecfa89bf8e4d87b665be540cb2e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k_cmsis_test.cmd
@@ -0,0 +1,37 @@
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Needs to be compiled with -O0
+file ../../../tools/make/gen/apollo3evb_cortex-m4/bin/preprocessor_1k_cmsis_test
+target remote localhost:2331
+load ../../../tools/make/gen/apollo3evb_cortex-m4/bin/preprocessor_1k_cmsis_test
+monitor reset
+break preprocessor.cc:68
+commands
+dump verilog value cmsis_windowed_input.txt bufB
+c
+end
+break preprocessor.cc:76
+commands
+dump verilog value cmsis_dft.txt bufA
+c
+end
+break preprocessor.cc:81
+commands
+dump verilog value cmsis_power.txt bufB
+c
+end
+break preprocessor.cc:83
+commands
+dump verilog memory cmsis_power_avg.txt output output+42
+c
+end
+break preprocessor_1k.cc:50
+commands
+print count
+end
+c
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k_micro_test.cmd b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k_micro_test.cmd
new file mode 100644
index 0000000000000000000000000000000000000000..dc9cd4f0a41b20a50d487da8c68fa93b35439e38
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k_micro_test.cmd
@@ -0,0 +1,25 @@
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Needs to be run when compiled with -O0
+file ../../../tools/make/gen/apollo3evb_cortex-m4/bin/preprocessor_1k_micro_test
+target remote localhost:2331
+load ../../../tools/make/gen/apollo3evb_cortex-m4/bin/preprocessor_1k_micro_test
+monitor reset
+break preprocessor.cc:211
+commands
+dump verilog value micro_windowed_input.txt fixed_input
+dump verilog value micro_dft.txt fourier_values
+dump verilog value micro_power.txt power_spectrum
+dump verilog memory micro_power_avg.txt output output+42
+c
+end
+break preprocessor_1k.cc:50
+commands
+print count
+end
+c
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_test.cmd b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_test.cmd
new file mode 100644
index 0000000000000000000000000000000000000000..bd2048e80ae3dffc5b6650d730c96b617a1379f9
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_test.cmd
@@ -0,0 +1,11 @@
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+file ../../gen/apollo3evb_cortex-m4/bin/preprocessor_test
+target remote localhost:2331
+load ../../gen/apollo3evb_cortex-m4/bin/preprocessor_test
+monitor reset
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_cmsis_scores.cmd b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_cmsis_scores.cmd
new file mode 100644
index 0000000000000000000000000000000000000000..ace278ff9a2e20f51590dd9fd5d66b84e65c023b
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_cmsis_scores.cmd
@@ -0,0 +1,26 @@
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+file ../../../tools/make/gen/apollo3evb_cortex-m4/bin/pushbutton_cmsis_speech_test
+target remote localhost:2331
+load ../../../tools/make/gen/apollo3evb_cortex-m4/bin/pushbutton_cmsis_speech_test
+monitor reset
+break pushbutton_main.c:307
+commands
+printf "Silence score: %d\n", g_silence_score
+printf "Unknown score: %d\n", g_unknown_score
+printf "Yes score: %d\n", g_yes_score
+printf "No score: %d\n", g_no_score
+printf "g_scores[0]: %d\n", g_scores[0]
+printf "g_scores[1]: %d\n", g_scores[1]
+printf "g_scores[2]: %d\n", g_scores[2]
+printf "g_scores[3]: %d\n", g_scores[3]
+printf "max_score: %d\n", max_score
+printf "max_score_index: %d\n", max_score_index
+c
+end
+c
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_cmsis_voice.cmd b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_cmsis_voice.cmd
new file mode 100644
index 0000000000000000000000000000000000000000..5dea48e62aba123b54a19c02847236cf28fc2a38
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_cmsis_voice.cmd
@@ -0,0 +1,25 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+file ../../../tools/make/gen/apollo3evb_cortex-m4/bin/pushbutton_cmsis_speech_test
+target remote localhost:2331
+load ../../../tools/make/gen/apollo3evb_cortex-m4/bin/pushbutton_cmsis_speech_test
+monitor reset
+break pushbutton_main.c:296
+commands
+dump verilog value captured_data.txt captured_data
+c
+end
+c
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_main.c b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_main.c
new file mode 100644
index 0000000000000000000000000000000000000000..afee38343b3fac81de945dcd01b53ad35e8be270
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_main.c
@@ -0,0 +1,322 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* This file is a modification of the Tensorflow Micro Lite file _main.c */
+
+#include <stdint.h>
+#include "am_bsp.h"
+#include "am_mcu_apollo.h"  // Defines AM_CMSIS_REGS
+#include "am_util.h"
+
+#define ARM_MATH_CM4
+#include <arm_math.h>
+
+//*****************************************************************************
+// Parameters
+//
+// Total number of bytes transferred = 320*50*2 = 32000
+//*****************************************************************************
+
+#define FRAME_SIZE 320  // Capture one 320-sample (20-ms) frame at a time
+#define NUM_FRAMES 50   // Number of frames in 1 second
+
+//*****************************************************************************
+// GLOBALS
+//*****************************************************************************
+
+volatile int16_t g_numFramesCaptured = 0;
+volatile bool g_bPDMDataReady = false;
+int16_t
+    captured_data[FRAME_SIZE * NUM_FRAMES];  // Location of 1-second data buffer
+extern uint8_t g_silence_score;
+extern uint8_t g_unknown_score;
+extern uint8_t g_yes_score;
+extern uint8_t g_no_score;
+q7_t g_scores[4] = {0};
+
+//*****************************************************************************
+// The entry point for the application.
+//*****************************************************************************
+extern int main(int argc, char** argv);
+
+void DebugLog(const char* s) { am_util_stdio_printf("%s", s); }
+void DebugLogInt32(int32_t i) { am_util_stdio_printf("%d", i); }
+void DebugLogUInt32(uint32_t i) { am_util_stdio_printf("%d", i); }
+void DebugLogHex(uint32_t i) { am_util_stdio_printf("0x%8x", i); }
+void DebugLogFloat(float i) { am_util_stdio_printf("%f", i); }
+
+//*****************************************************************************
+// PDM configuration information.
+//*****************************************************************************
+void* PDMHandle;
+
+am_hal_pdm_config_t g_sPdmConfig = {
+    .eClkDivider = AM_HAL_PDM_MCLKDIV_1,
+    .eLeftGain = AM_HAL_PDM_GAIN_P225DB,
+    .eRightGain = AM_HAL_PDM_GAIN_P225DB,
+    .ui32DecimationRate =
+        48,  // OSR = 1500/16 = 96 = 2*SINCRATE --> SINC_RATE = 48
+    .bHighPassEnable = 0,
+    .ui32HighPassCutoff = 0xB,
+    .ePDMClkSpeed = AM_HAL_PDM_CLK_1_5MHZ,
+    .bInvertI2SBCLK = 0,
+    .ePDMClkSource = AM_HAL_PDM_INTERNAL_CLK,
+    .bPDMSampleDelay = 0,
+    .bDataPacking = 1,
+    .ePCMChannels = AM_HAL_PDM_CHANNEL_RIGHT,
+    .bLRSwap = 0,
+};
+
+//*****************************************************************************
+// BUTTON0 pin configuration settings.
+//*****************************************************************************
+const am_hal_gpio_pincfg_t g_deepsleep_button0 = {
+    .uFuncSel = 3,
+    .eIntDir = AM_HAL_GPIO_PIN_INTDIR_LO2HI,
+    .eGPInput = AM_HAL_GPIO_PIN_INPUT_ENABLE,
+};
+
+//*****************************************************************************
+// PDM initialization.
+//*****************************************************************************
+void pdm_init(void) {
+  //
+  // Initialize, power-up, and configure the PDM.
+  //
+  am_hal_pdm_initialize(0, &PDMHandle);
+  am_hal_pdm_power_control(PDMHandle, AM_HAL_PDM_POWER_ON, false);
+  am_hal_pdm_configure(PDMHandle, &g_sPdmConfig);
+  am_hal_pdm_enable(PDMHandle);
+
+  //
+  // Configure the necessary pins.
+  //
+  am_hal_gpio_pincfg_t sPinCfg = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  // ARPIT 181019
+  // sPinCfg.uFuncSel = AM_HAL_PIN_10_PDMCLK;
+  // am_hal_gpio_pinconfig(10, sPinCfg);
+  sPinCfg.uFuncSel = AM_HAL_PIN_12_PDMCLK;
+  am_hal_gpio_pinconfig(12, sPinCfg);
+
+  sPinCfg.uFuncSel = AM_HAL_PIN_11_PDMDATA;
+  am_hal_gpio_pinconfig(11, sPinCfg);
+
+  // am_hal_gpio_state_write(14, AM_HAL_GPIO_OUTPUT_CLEAR);
+  // am_hal_gpio_pinconfig(14, g_AM_HAL_GPIO_OUTPUT);
+
+  //
+  // Configure and enable PDM interrupts (set up to trigger on DMA
+  // completion).
+  //
+  am_hal_pdm_interrupt_enable(PDMHandle,
+                              (AM_HAL_PDM_INT_DERR | AM_HAL_PDM_INT_DCMP |
+                               AM_HAL_PDM_INT_UNDFL | AM_HAL_PDM_INT_OVF));
+
+#if AM_CMSIS_REGS
+  NVIC_EnableIRQ(PDM_IRQn);
+#else
+  am_hal_interrupt_enable(AM_HAL_INTERRUPT_PDM);
+#endif
+}
+
+//*****************************************************************************
+//
+// Start a transaction to get some number of bytes from the PDM interface.
+//
+//*****************************************************************************
+void pdm_data_get(void) {
+  //
+  // Configure DMA and target address.
+  //
+  am_hal_pdm_transfer_t sTransfer;
+  sTransfer.ui32TargetAddr =
+      (uint32_t)(&captured_data[FRAME_SIZE * g_numFramesCaptured]);
+  sTransfer.ui32TotalCount = 2 * FRAME_SIZE;  // Each sample is 2 bytes
+
+  //
+  // Start the data transfer.
+  //
+  am_hal_pdm_dma_start(PDMHandle, &sTransfer);
+}
+
+//*****************************************************************************
+//
+// PDM interrupt handler.
+//
+//*****************************************************************************
+void am_pdm_isr(void) {
+  uint32_t ui32Status;
+  //
+  // Read the interrupt status.
+  //
+  am_hal_pdm_interrupt_status_get(PDMHandle, &ui32Status, true);
+  am_hal_pdm_interrupt_clear(PDMHandle, ui32Status);
+
+  //
+  // Once our DMA transaction completes, send a flag to the main routine
+  //
+  if (ui32Status & AM_HAL_PDM_INT_DCMP) g_bPDMDataReady = true;
+}
+
+//*****************************************************************************
+// GPIO ISR
+// Will enable the PDM, set number of frames transferred to 0, and turn on LED
+//*****************************************************************************
+void am_gpio_isr(void) {
+  //
+  // Delay for debounce.
+  //
+  am_util_delay_ms(200);
+
+  //
+  // Clear the GPIO Interrupt (write to clear).
+  //
+  am_hal_gpio_interrupt_clear(AM_HAL_GPIO_BIT(AM_BSP_GPIO_BUTTON0));
+
+  // Start audio transfer
+  am_hal_pdm_fifo_flush(PDMHandle);
+  pdm_data_get();
+  am_hal_pdm_enable(PDMHandle);
+
+  //
+  // Turn on LED 0
+  //
+  am_devices_led_on(am_bsp_psLEDs, 0);
+}
+
+int _main(void) {
+  am_util_id_t sIdDevice;
+  uint32_t ui32StrBuf;
+
+  //
+  // Set the clock frequency.
+  //
+  am_hal_clkgen_control(AM_HAL_CLKGEN_CONTROL_SYSCLK_MAX, 0);
+
+  //
+  // Set the default cache configuration
+  //
+  am_hal_cachectrl_config(&am_hal_cachectrl_defaults);
+  am_hal_cachectrl_enable();
+
+  //
+  // Configure the board for low power operation.
+  //
+  am_bsp_low_power_init();
+
+#if defined(AM_BSP_NUM_BUTTONS) && defined(AM_BSP_NUM_LEDS)
+  //
+  // Configure the button pin.
+  //
+  am_hal_gpio_pinconfig(AM_BSP_GPIO_BUTTON0, g_deepsleep_button0);
+
+  //
+  // Clear the GPIO Interrupt (write to clear).
+  //
+  am_hal_gpio_interrupt_clear(AM_HAL_GPIO_BIT(AM_BSP_GPIO_BUTTON0));
+
+  //
+  // Enable the GPIO/button interrupt.
+  //
+  am_hal_gpio_interrupt_enable(AM_HAL_GPIO_BIT(AM_BSP_GPIO_BUTTON0));
+
+  //
+  // Configure the LEDs.
+  //
+  am_devices_led_array_init(am_bsp_psLEDs, AM_BSP_NUM_LEDS);
+
+  //
+  // Turn the LEDs off
+  //
+  for (int ix = 0; ix < AM_BSP_NUM_LEDS; ix++) {
+    am_devices_led_off(am_bsp_psLEDs, ix);
+  }
+
+//    am_devices_led_on(am_bsp_psLEDs, 1);
+#endif  // defined(AM_BSP_NUM_BUTTONS)  &&  defined(AM_BSP_NUM_LEDS)
+
+#if AM_CMSIS_REGS
+  NVIC_EnableIRQ(GPIO_IRQn);
+#else   // AM_CMSIS_REGS
+  am_hal_interrupt_enable(AM_HAL_INTERRUPT_GPIO);
+#endif  // AM_CMSIS_REGS
+
+  //
+  // Enable interrupts to the core.
+  //
+  am_hal_interrupt_master_enable();
+
+  // Turn on PDM
+  pdm_init();
+
+  //
+  // Initialize the printf interface for UART output
+  //
+  am_bsp_uart_printf_enable();
+
+  //
+  // Print the banner.
+  //
+  am_util_stdio_terminal_clear();
+  am_util_stdio_printf("Starting streaming test\n\n");
+
+  // Score variables
+  q7_t max_score = 0;
+  uint32_t max_score_index = 0;
+
+  while (1) {
+    am_hal_interrupt_master_disable();
+
+    if (g_bPDMDataReady) {
+      g_bPDMDataReady = false;
+      g_numFramesCaptured++;
+
+      if (g_numFramesCaptured < NUM_FRAMES) {
+        pdm_data_get();  // Start converting the next set of PCM samples.
+      }
+
+      else {
+        g_numFramesCaptured = 0;
+        // am_hal_pdm_disable(PDMHandle);
+        am_devices_led_off(am_bsp_psLEDs, 0);
+
+        main(0, NULL);
+
+        g_scores[0] = (q7_t)g_silence_score - 128;
+        g_scores[1] = (q7_t)g_unknown_score - 128;
+        g_scores[2] = (q7_t)g_yes_score - 128;
+        g_scores[3] = (q7_t)g_no_score - 128;
+
+        am_devices_led_off(
+            am_bsp_psLEDs,
+            max_score_index + 1);  // Turn off LED for previous max score
+        arm_max_q7(g_scores, 4, &max_score, &max_score_index);
+        am_devices_led_on(
+            am_bsp_psLEDs,
+            max_score_index + 1);  // Turn on LED for new max score
+      }
+    }
+
+    //
+    // Go to Deep Sleep.
+    //
+    am_hal_sysctrl_sleep(AM_HAL_SYSCTRL_SLEEP_DEEP);
+
+    am_hal_interrupt_master_enable();
+  }
+
+  // main(0, NULL);
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..95043f857b34b953c91a762bc1a54e9489431bff
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_test.cc
@@ -0,0 +1,116 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* This file is a modification of the Tensorflow Micro Lite file
+ * micro_speech_test.cc */
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
+#include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/micro_interpreter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/version.h"
+
+extern int16_t captured_data[16000];
+uint8_t g_silence_score = 0;
+uint8_t g_unknown_score = 0;
+uint8_t g_yes_score = 0;
+uint8_t g_no_score = 0;
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestPreprocessor) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  uint8_t preprocessed_data[43 * 49];
+  TfLiteStatus preprocess_1sec_status =
+      Preprocess_1sec(error_reporter, captured_data, preprocessed_data);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, preprocess_1sec_status);
+
+  // Map the model into a usable data structure. This doesn't involve any
+  // copying or parsing, it's a very lightweight operation.
+  const tflite::Model* model = ::tflite::GetModel(g_tiny_conv_model_data);
+  if (model->version() != TFLITE_SCHEMA_VERSION) {
+    error_reporter->Report(
+        "Model provided is schema version %d not equal "
+        "to supported version %d.\n",
+        model->version(), TFLITE_SCHEMA_VERSION);
+  }
+
+  // This pulls in all the operation implementations we need.
+  tflite::ops::micro::AllOpsResolver resolver;
+
+  // Create an area of memory to use for input, output, and intermediate arrays.
+  const int tensor_arena_size = 10 * 1024;
+  uint8_t tensor_arena[tensor_arena_size];
+  tflite::SimpleTensorAllocator tensor_allocator(tensor_arena,
+                                                 tensor_arena_size);
+
+  // Build an interpreter to run the model with.
+  tflite::MicroInterpreter interpreter(model, resolver, &tensor_allocator,
+                                       error_reporter);
+
+  // Get information about the memory area to use for the model's input.
+  TfLiteTensor* input = interpreter.input(0);
+
+  // Make sure the input has the properties we expect.
+  TF_LITE_MICRO_EXPECT_NE(nullptr, input);
+  TF_LITE_MICRO_EXPECT_EQ(4, input->dims->size);
+  TF_LITE_MICRO_EXPECT_EQ(1, input->dims->data[0]);
+  TF_LITE_MICRO_EXPECT_EQ(49, input->dims->data[1]);
+  TF_LITE_MICRO_EXPECT_EQ(43, input->dims->data[2]);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteUInt8, input->type);
+
+  // Copy a spectrogram created from a .wav audio file of someone saying "Yes",
+  // into the memory area used for the input.
+  for (int i = 0; i < input->bytes; ++i) {
+    input->data.uint8[i] = preprocessed_data[i];
+  }
+
+  // Run the model on this input and make sure it succeeds.
+  TfLiteStatus invoke_status = interpreter.Invoke();
+  if (invoke_status != kTfLiteOk) {
+    error_reporter->Report("Invoke failed\n");
+  }
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, invoke_status);
+
+  // Get the output from the model, and make sure it's the expected size and
+  // type.
+  TfLiteTensor* output = interpreter.output(0);
+  TF_LITE_MICRO_EXPECT_EQ(2, output->dims->size);
+  TF_LITE_MICRO_EXPECT_EQ(1, output->dims->data[0]);
+  TF_LITE_MICRO_EXPECT_EQ(4, output->dims->data[1]);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteUInt8, output->type);
+
+  // There are four possible classes in the output, each with a score.
+  const int kSilenceIndex = 0;
+  const int kUnknownIndex = 1;
+  const int kYesIndex = 2;
+  const int kNoIndex = 3;
+
+  // Make sure that the expected "Yes" score is higher than the other classes.
+  g_silence_score = output->data.uint8[kSilenceIndex];
+  g_unknown_score = output->data.uint8[kUnknownIndex];
+  g_yes_score = output->data.uint8[kYesIndex];
+  g_no_score = output->data.uint8[kNoIndex];
+
+  error_reporter->Report("Ran successfully\n");
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
index c0365d56901b503628b323a2fe09a4fa0de9165e..52db18e6868371afc0b7cd39f6f41d0d60b91689 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 namespace {
 int16_t g_dummy_audio_data[kMaxAudioSampleSize];
+int32_t g_latest_audio_timestamp = 0;
 }  // namespace
 
 TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
@@ -31,3 +32,8 @@ TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
   *audio_samples = g_dummy_audio_data;
   return kTfLiteOk;
 }
+
+int32_t LatestAudioTimestamp() {
+  g_latest_audio_timestamp += 100;
+  return g_latest_audio_timestamp;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h
index 7e2442a5e83ee1f809f82587c816adb01dc09e5e..b69067364198d7285d3f2bfc34208168effacb35 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h
@@ -33,4 +33,14 @@ TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
                              int start_ms, int duration_ms,
                              int* audio_samples_size, int16_t** audio_samples);
 
+// Returns the time that audio data was last captured in milliseconds. There's
+// no contract about what time zero represents, the accuracy, or the granularity
+// of the result. Subsequent calls will generally not return a lower value, but
+// even that's not guaranteed if there's an overflow  wraparound.
+// The reference implementation of this function just returns a constantly
+// incrementing value for each call, since it would need a non-portable platform
+// call to access time information. For real applications, you'll need to write
+// your own platform-specific implementation.
+int32_t LatestAudioTimestamp();
+
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_AUDIO_PROVIDER_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc
index 5f7c7605f0feb3fd3179a0edd5e51574b867ce68..85fbbb80a6c5b330230c1d1d0186de795edc4754 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc
@@ -14,6 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+
+#include <limits>
+
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
@@ -41,4 +44,27 @@ TF_LITE_MICRO_TEST(TestAudioProvider) {
   }
 }
 
+TF_LITE_MICRO_TEST(TestTimer) {
+  // Make sure that the technically-undefined overflow behavior we rely on below
+  // works on this platform. It's still not guaranteed, but at least this is a
+  // sanity check.  Turn off when running with ASan, as it will complain about
+  // the following undefined behavior.
+#ifndef ADDRESS_SANITIZER
+  int32_t overflow_value = std::numeric_limits<int32_t>::max();
+  overflow_value += 1;
+  TF_LITE_MICRO_EXPECT_EQ(std::numeric_limits<int32_t>::min(), overflow_value);
+#endif
+
+  const int32_t first_time = LatestAudioTimestamp();
+  const int32_t second_time = LatestAudioTimestamp();
+
+  // It's possible that the timer may have wrapped around from +BIG_NUM to
+  // -BIG_NUM between the first and second calls, since we're storing
+  // milliseconds in a 32-bit integer. It's not reasonable that the call itself
+  // would have taken more than 2^31 milliseconds though, so look at the
+  // difference and rely on integer overflow to ensure it's accurate.
+  const int32_t time_delta = (second_time - first_time);
+  TF_LITE_MICRO_EXPECT_LE(0, time_delta);
+}
+
 TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/Makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..5585ed7269b71d279f1dd22cb9dd04120e7dd37f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/Makefile.inc
@@ -0,0 +1,7 @@
+# Settings for the Discovery STM32F746NG board.
+ifneq ($(filter disco_f746ng,$(ALL_TAGS)),)
+  MBED_PROJECT_FILES += \
+    AUDIO_DISCO_F746NG.lib \
+    BSP_DISCO_F746NG.lib \
+    SDRAM_DISCO_F746NG.lib
+endif
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/audio_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/audio_provider.cc
new file mode 100644
index 0000000000000000000000000000000000000000..06647d0c536564c26d72cb73396ca36efb3aeb25
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/audio_provider.cc
@@ -0,0 +1,182 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+
+#include "AUDIO_DISCO_F746NG.h"
+#include "SDRAM_DISCO_F746NG.h"
+#include "mbed.h"  // NOLINT
+
+namespace {
+
+bool g_is_audio_initialized = false;
+constexpr int kAudioCaptureBufferSize = kAudioSampleFrequency * 0.5;
+int16_t g_audio_capture_buffer[kAudioCaptureBufferSize];
+int16_t g_audio_output_buffer[kMaxAudioSampleSize];
+int32_t g_latest_audio_timestamp = 0;
+
+// For a full example of how to access audio on the STM32F746NG board, see
+// https://os.mbed.com/teams/ST/code/DISCO-F746NG_AUDIO_demo/
+AUDIO_DISCO_F746NG g_audio_device;
+SDRAM_DISCO_F746NG g_sdram_device;
+
+typedef enum {
+  BUFFER_OFFSET_NONE = 0,
+  BUFFER_OFFSET_HALF = 1,
+  BUFFER_OFFSET_FULL = 2,
+} BUFFER_StateTypeDef;
+
+#define AUDIO_BLOCK_SIZE ((uint32_t)2048)
+#define AUDIO_BUFFER_IN SDRAM_DEVICE_ADDR /* In SDRAM */
+#define AUDIO_BUFFER_OUT \
+  (SDRAM_DEVICE_ADDR + (AUDIO_BLOCK_SIZE * 2)) /* In SDRAM */
+__IO uint32_t g_audio_rec_buffer_state = BUFFER_OFFSET_NONE;
+
+uint8_t SetSysClock_PLL_HSE_200MHz() {
+  RCC_ClkInitTypeDef RCC_ClkInitStruct;
+  RCC_OscInitTypeDef RCC_OscInitStruct;
+
+  // Enable power clock
+  __PWR_CLK_ENABLE();
+
+  // Enable HSE oscillator and activate PLL with HSE as source
+  RCC_OscInitStruct.OscillatorType = RCC_OSCILLATORTYPE_HSE;
+  RCC_OscInitStruct.HSEState = RCC_HSE_ON; /* External xtal on OSC_IN/OSC_OUT */
+
+  // Warning: this configuration is for a 25 MHz xtal clock only
+  RCC_OscInitStruct.PLL.PLLState = RCC_PLL_ON;
+  RCC_OscInitStruct.PLL.PLLSource = RCC_PLLSOURCE_HSE;
+  RCC_OscInitStruct.PLL.PLLM = 25;   // VCO input clock = 1 MHz (25 MHz / 25)
+  RCC_OscInitStruct.PLL.PLLN = 400;  // VCO output clock = 400 MHz (1 MHz * 400)
+  RCC_OscInitStruct.PLL.PLLP = RCC_PLLP_DIV2;  // PLLCLK = 200 MHz (400 MHz / 2)
+  RCC_OscInitStruct.PLL.PLLQ = 8;  // USB clock = 50 MHz (400 MHz / 8)
+
+  if (HAL_RCC_OscConfig(&RCC_OscInitStruct) != HAL_OK) {
+    return 0;  // FAIL
+  }
+
+  // Activate the OverDrive to reach the 216 MHz Frequency
+  if (HAL_PWREx_EnableOverDrive() != HAL_OK) {
+    return 0;  // FAIL
+  }
+
+  // Select PLL as system clock source and configure the HCLK, PCLK1 and PCLK2
+  // clocks dividers
+  RCC_ClkInitStruct.ClockType = (RCC_CLOCKTYPE_SYSCLK | RCC_CLOCKTYPE_HCLK |
+                                 RCC_CLOCKTYPE_PCLK1 | RCC_CLOCKTYPE_PCLK2);
+  RCC_ClkInitStruct.SYSCLKSource = RCC_SYSCLKSOURCE_PLLCLK;  // 200 MHz
+  RCC_ClkInitStruct.AHBCLKDivider = RCC_SYSCLK_DIV1;         // 200 MHz
+  RCC_ClkInitStruct.APB1CLKDivider = RCC_HCLK_DIV4;          //  50 MHz
+  RCC_ClkInitStruct.APB2CLKDivider = RCC_HCLK_DIV2;          // 100 MHz
+
+  if (HAL_RCC_ClockConfig(&RCC_ClkInitStruct, FLASH_LATENCY_7) != HAL_OK) {
+    return 0;  // FAIL
+  }
+  HAL_RCC_MCOConfig(RCC_MCO1, RCC_MCO1SOURCE_HSE, RCC_MCODIV_4);
+  return 1;  // OK
+}
+
+TfLiteStatus InitAudioRecording(tflite::ErrorReporter* error_reporter) {
+  SetSysClock_PLL_HSE_200MHz();
+
+  // Initialize SDRAM buffers.
+  memset((uint16_t*)AUDIO_BUFFER_IN, 0, AUDIO_BLOCK_SIZE * 2);
+  memset((uint16_t*)AUDIO_BUFFER_OUT, 0, AUDIO_BLOCK_SIZE * 2);
+  g_audio_rec_buffer_state = BUFFER_OFFSET_NONE;
+
+  // Start Recording.
+  g_audio_device.IN_Record((uint16_t*)AUDIO_BUFFER_IN, AUDIO_BLOCK_SIZE);
+
+  // Also play results out to headphone jack.
+  g_audio_device.OUT_SetAudioFrameSlot(CODEC_AUDIOFRAME_SLOT_02);
+  g_audio_device.OUT_Play((uint16_t*)AUDIO_BUFFER_OUT, AUDIO_BLOCK_SIZE * 2);
+
+  return kTfLiteOk;
+}
+
+void CaptureSamples(const int16_t* sample_data) {
+  const int sample_size = AUDIO_BLOCK_SIZE / (sizeof(int16_t) * 2);
+  const int32_t time_in_ms =
+      g_latest_audio_timestamp + (sample_size / (kAudioSampleFrequency / 1000));
+
+  const int32_t start_sample_offset =
+      g_latest_audio_timestamp * (kAudioSampleFrequency / 1000);
+  for (int i = 0; i < sample_size; ++i) {
+    const int capture_index =
+        (start_sample_offset + i) % kAudioCaptureBufferSize;
+    g_audio_capture_buffer[capture_index] =
+        (sample_data[(i * 2) + 0] / 2) + (sample_data[(i * 2) + 1] / 2);
+  }
+  // This is how we let the outside world know that new audio data has arrived.
+  g_latest_audio_timestamp = time_in_ms;
+}
+
+}  // namespace
+
+// These callbacks need to be linkable symbols, because they override weak
+// default versions.
+void BSP_AUDIO_IN_TransferComplete_CallBack(void) {
+  g_audio_rec_buffer_state = BUFFER_OFFSET_FULL;
+  /* Copy recorded 1st half block */
+  memcpy((uint16_t*)(AUDIO_BUFFER_OUT), (uint16_t*)(AUDIO_BUFFER_IN),
+         AUDIO_BLOCK_SIZE);
+  CaptureSamples(reinterpret_cast<int16_t*>(AUDIO_BUFFER_IN));
+  return;
+}
+
+// Another weak symbol override.
+void BSP_AUDIO_IN_HalfTransfer_CallBack(void) {
+  g_audio_rec_buffer_state = BUFFER_OFFSET_HALF;
+  /* Copy recorded 2nd half block */
+  memcpy((uint16_t*)(AUDIO_BUFFER_OUT + (AUDIO_BLOCK_SIZE)),
+         (uint16_t*)(AUDIO_BUFFER_IN + (AUDIO_BLOCK_SIZE)), AUDIO_BLOCK_SIZE);
+  CaptureSamples(
+      reinterpret_cast<int16_t*>(AUDIO_BUFFER_IN + AUDIO_BLOCK_SIZE));
+  return;
+}
+
+// Main entry point for getting audio data.
+TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
+                             int start_ms, int duration_ms,
+                             int* audio_samples_size, int16_t** audio_samples) {
+  if (!g_is_audio_initialized) {
+    TfLiteStatus init_status = InitAudioRecording(error_reporter);
+    if (init_status != kTfLiteOk) {
+      return init_status;
+    }
+    g_is_audio_initialized = true;
+  }
+  // This should only be called when the main thread notices that the latest
+  // audio sample data timestamp has changed, so that there's new data in the
+  // capture ring buffer. The ring buffer will eventually wrap around and
+  // overwrite the data, but the assumption is that the main thread is checking
+  // often enough and the buffer is large enough that this call will be made
+  // before that happens.
+  const int start_offset = start_ms * (kAudioSampleFrequency / 1000);
+  const int duration_sample_count =
+      duration_ms * (kAudioSampleFrequency / 1000);
+  for (int i = 0; i < duration_sample_count; ++i) {
+    const int capture_index = (start_offset + i) % kAudioCaptureBufferSize;
+    g_audio_output_buffer[i] = g_audio_capture_buffer[capture_index];
+  }
+
+  *audio_samples_size = kMaxAudioSampleSize;
+  *audio_samples = g_audio_output_buffer;
+  return kTfLiteOk;
+}
+
+int32_t LatestAudioTimestamp() { return g_latest_audio_timestamp; }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/timer.cc
similarity index 81%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/timer.cc
index 6c96a61ab517487413e875dc7369bddb1c9a0d9a..a8f0fe4bd50c3b6d16a426adc461ea125cbc9859 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/timer.cc
@@ -15,8 +15,10 @@ limitations under the License.
 
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/timer.h"
 
-int32_t TimeInMilliseconds() {
-  static int current_time = 0;
-  current_time += 100;
-  return current_time;
+namespace {
+int32_t g_current_time = 0;
 }
+
+void SetTimeInMilliseconds(int32_t time) { g_current_time = time; }
+
+int32_t TimeInMilliseconds() { return g_current_time; }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
index c4c52ac0ff3696a05192465f8ac911b5d6a83925..7f9ece41dd3f013ae328ffd1bdc98f197855a131 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
@@ -18,20 +18,11 @@ limitations under the License.
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/timer.h"
-
-namespace {
-// Stores the timestamp for the previous fetch of audio data, so that we can
-// avoid recalculating all the features from scratch if some earlier timeslices
-// are still present.
-int32_t g_last_time_in_ms = 0;
-// Make sure we don't try to use cached information if this is the first call
-// into the provider.
-bool g_is_first_run = true;
-}  // namespace
 
 FeatureProvider::FeatureProvider(int feature_size, uint8_t* feature_data)
-    : feature_size_(feature_size), feature_data_(feature_data) {
+    : feature_size_(feature_size),
+      feature_data_(feature_data),
+      is_first_run_(true) {
   // Initialize the feature data to default values.
   for (int n = 0; n < feature_size_; ++n) {
     feature_data_[n] = 0;
@@ -41,24 +32,23 @@ FeatureProvider::FeatureProvider(int feature_size, uint8_t* feature_data)
 FeatureProvider::~FeatureProvider() {}
 
 TfLiteStatus FeatureProvider::PopulateFeatureData(
-    tflite::ErrorReporter* error_reporter, int* how_many_new_slices) {
+    tflite::ErrorReporter* error_reporter, int32_t last_time_in_ms,
+    int32_t time_in_ms, int* how_many_new_slices) {
   if (feature_size_ != kFeatureElementCount) {
     error_reporter->Report("Requested feature_data_ size %d doesn't match %d",
                            feature_size_, kFeatureElementCount);
     return kTfLiteError;
   }
 
-  const int32_t time_in_ms = TimeInMilliseconds();
   // Quantize the time into steps as long as each window stride, so we can
   // figure out which audio data we need to fetch.
-  const int last_step = (g_last_time_in_ms / kFeatureSliceStrideMs);
+  const int last_step = (last_time_in_ms / kFeatureSliceStrideMs);
   const int current_step = (time_in_ms / kFeatureSliceStrideMs);
-  g_last_time_in_ms = time_in_ms;
 
   int slices_needed = current_step - last_step;
   // If this is the first call, make sure we don't use any cached information.
-  if (g_is_first_run) {
-    g_is_first_run = false;
+  if (is_first_run_) {
+    is_first_run_ = false;
     slices_needed = kFeatureSliceCount;
   }
   if (slices_needed > kFeatureSliceCount) {
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h
index a86c56ebf053a8807e38c42c6a7088c146a31b9e..ee3a480e947eced06e30ac089433f44e18d6adc3 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h
@@ -38,11 +38,15 @@ class FeatureProvider {
   // Fills the feature data with information from audio inputs, and returns how
   // many feature slices were updated.
   TfLiteStatus PopulateFeatureData(tflite::ErrorReporter* error_reporter,
+                                   int32_t last_time_in_ms, int32_t time_in_ms,
                                    int* how_many_new_slices);
 
  private:
   int feature_size_;
   uint8_t* feature_data_;
+  // Make sure we don't try to use cached information if this is the first call
+  // into the provider.
+  bool is_first_run_;
 };
 
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_FEATURE_PROVIDER_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc
index 1e52aec8d2741678a0f79f643bb7dcf42c848a58..556cbfe799bd9adf2df8f584a4f10b4a1c834bd4 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc
@@ -30,7 +30,8 @@ TF_LITE_MICRO_TEST(TestFeatureProvider) {
 
   int how_many_new_slices = 0;
   TfLiteStatus populate_status = feature_provider.PopulateFeatureData(
-      error_reporter, &how_many_new_slices);
+      error_reporter, /* last_time_in_ms= */ 0, /* time_in_ms= */ 10000,
+      &how_many_new_slices);
   TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, populate_status);
   TF_LITE_MICRO_EXPECT_EQ(kFeatureSliceCount, how_many_new_slices);
 }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc
index 1890c25cf2b44c96c549757b31f88255d4a9ee09..3a9a5a4df1bf8239950dd2c79a1048706004e1f5 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
 #include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
@@ -68,16 +70,21 @@ int main(int argc, char* argv[]) {
   FeatureProvider feature_provider(kFeatureElementCount,
                                    model_input->data.uint8);
 
+  RecognizeCommands recognizer(error_reporter);
+
+  int32_t previous_time = 0;
   // Keep reading and analysing audio data in an infinite loop.
   while (true) {
     // Fetch the spectrogram for the current time.
+    const int32_t current_time = LatestAudioTimestamp();
     int how_many_new_slices = 0;
     TfLiteStatus feature_status = feature_provider.PopulateFeatureData(
-        error_reporter, &how_many_new_slices);
+        error_reporter, previous_time, current_time, &how_many_new_slices);
     if (feature_status != kTfLiteOk) {
       error_reporter->Report("Feature generation failed");
       return 1;
     }
+    previous_time = current_time;
     // If no new audio samples have been received since last time, don't bother
     // running the network model.
     if (how_many_new_slices == 0) {
@@ -105,7 +112,19 @@ int main(int argc, char* argv[]) {
       }
     }
 
-    error_reporter->Report("Heard %s", kCategoryLabels[top_category_index]);
+    const char* found_command = nullptr;
+    uint8_t score = 0;
+    bool is_new_command = false;
+    TfLiteStatus process_status = recognizer.ProcessLatestResults(
+        output, current_time, &found_command, &score, &is_new_command);
+    if (process_status != kTfLiteOk) {
+      error_reporter->Report(
+          "RecognizeCommands::ProcessLatestResults() failed");
+      return 1;
+    }
+    if (is_new_command) {
+      error_reporter->Report("Heard %s (%d)", found_command, score);
+    }
   }
 
   return 0;
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h b/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h
index 1d8f3123a57bc5b807d39151adaf64f29d2f5f95..f48252d14d251673f0070e63dfa4169ca3a89025 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h
@@ -23,6 +23,7 @@ limitations under the License.
 // frequency information. This has to be a power of two, and since we're dealing
 // with 30ms of 16KHz inputs, which means 480 samples, this is the next value.
 constexpr int kMaxAudioSampleSize = 512;
+constexpr int kAudioSampleFrequency = 16000;
 
 // All of these values are derived from the values used during model training,
 // if you change your model you'll need to update these constants.
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/osx/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/osx/Makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..8f8b33a9fa2afca902ef5fbcfa7f641b5cc58028
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/osx/Makefile.inc
@@ -0,0 +1,8 @@
+# Settings for Mac OS platforms.
+ifeq ($(TARGET), osx)
+  LINKER_FLAGS := \
+    -framework Foundation \
+    -framework AudioToolbox
+
+  MICROLITE_LIBS += $(LINKER_FLAGS)
+endif
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/osx/audio_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/osx/audio_provider.cc
new file mode 100644
index 0000000000000000000000000000000000000000..892757e799f3832db725424163e613bea35ab9e7
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/osx/audio_provider.cc
@@ -0,0 +1,139 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+
+#include <AudioToolbox/AudioToolbox.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+
+namespace {
+
+constexpr int kNumberRecordBuffers = 3;
+bool g_is_audio_initialized = false;
+constexpr int kAudioCaptureBufferSize = kAudioSampleFrequency * 0.5;
+int16_t g_audio_capture_buffer[kAudioCaptureBufferSize];
+int16_t g_audio_output_buffer[kMaxAudioSampleSize];
+int32_t g_latest_audio_timestamp = 0;
+
+// Checks for MacOS errors, prints information and returns a TF Lite version.
+#define RETURN_IF_OS_ERROR(error, error_reporter)                       \
+  do {                                                                  \
+    if (error != noErr) {                                               \
+      error_reporter->Report("Error: %s:%d (%d)\n", __FILE__, __LINE__, \
+                             error);                                    \
+      return kTfLiteError;                                              \
+    }                                                                   \
+  } while (0);
+
+// Called when an audio input buffer has been filled.
+void OnAudioBufferFilledCallback(
+    void* user_data, AudioQueueRef queue, AudioQueueBufferRef buffer,
+    const AudioTimeStamp* start_time, UInt32 num_packets,
+    const AudioStreamPacketDescription* packet_description) {
+  const int sample_size = buffer->mAudioDataByteSize / sizeof(float);
+  const int64_t sample_offset = start_time->mSampleTime;
+  const int32_t time_in_ms =
+      (sample_offset + sample_size) / (kAudioSampleFrequency / 1000);
+  const float* float_samples = static_cast<const float*>(buffer->mAudioData);
+  for (int i = 0; i < sample_size; ++i) {
+    const int capture_index = (sample_offset + i) % kAudioCaptureBufferSize;
+    g_audio_capture_buffer[capture_index] = float_samples[i] * ((1 << 15) - 1);
+  }
+  // This is how we let the outside world know that new audio data has arrived.
+  g_latest_audio_timestamp = time_in_ms;
+  AudioQueueEnqueueBuffer(queue, buffer, 0, nullptr);
+}
+
+// Set up everything we need to capture audio samples from the default recording
+// device on MacOS.
+TfLiteStatus InitAudioRecording(tflite::ErrorReporter* error_reporter) {
+  // Set up the format of the audio - single channel, 32-bit float at 16KHz.
+  AudioStreamBasicDescription recordFormat = {0};
+  recordFormat.mSampleRate = kAudioSampleFrequency;
+  recordFormat.mFormatID = kAudioFormatLinearPCM;
+  recordFormat.mFormatFlags =
+      kAudioFormatFlagIsFloat | kAudioFormatFlagIsPacked;
+  recordFormat.mBitsPerChannel = 8 * sizeof(float);
+  recordFormat.mChannelsPerFrame = 1;
+  recordFormat.mBytesPerFrame = sizeof(float) * recordFormat.mChannelsPerFrame;
+  recordFormat.mFramesPerPacket = 1;
+  recordFormat.mBytesPerPacket =
+      recordFormat.mBytesPerFrame * recordFormat.mFramesPerPacket;
+  recordFormat.mReserved = 0;
+
+  UInt32 propSize = sizeof(recordFormat);
+  RETURN_IF_OS_ERROR(AudioFormatGetProperty(kAudioFormatProperty_FormatInfo, 0,
+                                            NULL, &propSize, &recordFormat),
+                     error_reporter);
+
+  // Create a recording queue.
+  AudioQueueRef queue;
+  RETURN_IF_OS_ERROR(
+      AudioQueueNewInput(&recordFormat, OnAudioBufferFilledCallback,
+                         error_reporter, nullptr, nullptr, 0, &queue),
+      error_reporter);
+
+  // Set up the buffers we'll need.
+  int buffer_bytes = 512 * sizeof(float);
+  for (int i = 0; i < kNumberRecordBuffers; ++i) {
+    AudioQueueBufferRef buffer;
+    RETURN_IF_OS_ERROR(AudioQueueAllocateBuffer(queue, buffer_bytes, &buffer),
+                       error_reporter);
+    RETURN_IF_OS_ERROR(AudioQueueEnqueueBuffer(queue, buffer, 0, nullptr),
+                       error_reporter);
+  }
+
+  // Start capturing audio.
+  RETURN_IF_OS_ERROR(AudioQueueStart(queue, nullptr), error_reporter);
+
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
+                             int start_ms, int duration_ms,
+                             int* audio_samples_size, int16_t** audio_samples) {
+  if (!g_is_audio_initialized) {
+    TfLiteStatus init_status = InitAudioRecording(error_reporter);
+    if (init_status != kTfLiteOk) {
+      return init_status;
+    }
+    for (int i = 0; i < kMaxAudioSampleSize; ++i) {
+      g_audio_output_buffer[i] = 0;
+    }
+    g_is_audio_initialized = true;
+  }
+  // This should only be called when the main thread notices that the latest
+  // audio sample data timestamp has changed, so that there's new data in the
+  // capture ring buffer. The ring buffer will eventually wrap around and
+  // overwrite the data, but the assumption is that the main thread is checking
+  // often enough and the buffer is large enough that this call will be made
+  // before that happens.
+  const int start_offset = start_ms * (kAudioSampleFrequency / 1000);
+  const int duration_sample_count =
+      duration_ms * (kAudioSampleFrequency / 1000);
+  for (int i = 0; i < duration_sample_count; ++i) {
+    const int capture_index = (start_offset + i) % kAudioCaptureBufferSize;
+    g_audio_output_buffer[i] = g_audio_capture_buffer[capture_index];
+  }
+
+  *audio_samples_size = kMaxAudioSampleSize;
+  *audio_samples = g_audio_output_buffer;
+  return kTfLiteOk;
+}
+
+int32_t LatestAudioTimestamp() { return g_latest_audio_timestamp; }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
index f4a7f801cc6251b82339509f691fd64012fbe390..f8858aad72f3c141d20077ffa927e30bd9492987 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
@@ -32,6 +32,9 @@ limitations under the License.
 
 namespace {
 
+// Needed because some platforms don't have M_PI defined.
+constexpr float kPi = 3.14159265358979323846f;
+
 // Performs a discrete Fourier transform on the real inputs. This corresponds to
 // rdft() in the FFT package at http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html,
 // and to kiss_fftr() in KISSFFT at https://github.com/mborgerding/kissfft.
@@ -48,11 +51,11 @@ void CalculateDiscreteFourierTransform(float* time_series, int time_series_size,
   for (int i = 0; i < time_series_size / 2; ++i) {
     float real = 0;
     for (int j = 0; j < time_series_size; ++j) {
-      real += time_series[j] * cos(j * i * M_PI * 2 / time_series_size);
+      real += time_series[j] * cos(j * i * kPi * 2 / time_series_size);
     }
     float imaginary = 0;
     for (int j = 0; j < time_series_size; ++j) {
-      imaginary -= time_series[j] * sin(j * i * M_PI * 2 / time_series_size);
+      imaginary -= time_series[j] * sin(j * i * kPi * 2 / time_series_size);
     }
     fourier_output[(i * 2) + 0] = real;
     fourier_output[(i * 2) + 1] = imaginary;
@@ -63,7 +66,7 @@ void CalculateDiscreteFourierTransform(float* time_series, int time_series_size,
 // of the current sample window are weighted more heavily than those at the end.
 void CalculatePeriodicHann(int window_length, float* window_function) {
   for (int i = 0; i < window_length; ++i) {
-    window_function[i] = 0.5 - 0.5 * cos((2 * M_PI * i) / window_length);
+    window_function[i] = 0.5 - 0.5 * cos((2 * kPi * i) / window_length);
   }
 }
 
@@ -143,3 +146,12 @@ TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
   }
   return kTfLiteOk;
 }
+
+TfLiteStatus Preprocess_1sec(tflite::ErrorReporter* error_reporter,
+                             const int16_t* input, uint8_t* output) {
+  int i;
+  for (i = 0; i < 49; i++) {
+    Preprocess(error_reporter, input + i * 320, 480, 43, output + i * 43);
+  }
+  return kTfLiteOk;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h
index adff790d6cc527578dbfb9dc481c99c1021b92db..d710beeceea6a7b6fb7fca748e5795f602276e32 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h
@@ -28,4 +28,7 @@ TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
                         const int16_t* input, int input_size, int output_size,
                         uint8_t* output);
 
+TfLiteStatus Preprocess_1sec(tflite::ErrorReporter* error_reporter,
+                             const int16_t* input, uint8_t* output);
+
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_PREPROCESSOR_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9366dc71e0d76d087a3dad9b9c4c206a0749e235
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
@@ -0,0 +1,139 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h"
+
+#include <limits>
+
+RecognizeCommands::RecognizeCommands(tflite::ErrorReporter* error_reporter,
+                                     int32_t average_window_duration_ms,
+                                     uint8_t detection_threshold,
+                                     int32_t suppression_ms,
+                                     int32_t minimum_count)
+    : error_reporter_(error_reporter),
+      average_window_duration_ms_(average_window_duration_ms),
+      detection_threshold_(detection_threshold),
+      suppression_ms_(suppression_ms),
+      minimum_count_(minimum_count),
+      previous_results_(error_reporter) {
+  previous_top_label_ = "_silence_";
+  previous_top_label_time_ = 0;
+}
+
+TfLiteStatus RecognizeCommands::ProcessLatestResults(
+    const TfLiteTensor* latest_results, const int32_t current_time_ms,
+    const char** found_command, uint8_t* score, bool* is_new_command) {
+  if ((latest_results->dims->size != 2) ||
+      (latest_results->dims->data[0] != 1) ||
+      (latest_results->dims->data[1] != kCategoryCount)) {
+    error_reporter_->Report(
+        "The results for recognition should contain %d elements, but there are "
+        "%d in an %d-dimensional shape",
+        kCategoryCount, latest_results->dims->data[1],
+        latest_results->dims->size);
+    return kTfLiteError;
+  }
+
+  if (latest_results->type != kTfLiteUInt8) {
+    error_reporter_->Report(
+        "The results for recognition should be uint8 elements, but are %d",
+        latest_results->type);
+    return kTfLiteError;
+  }
+
+  if ((!previous_results_.empty()) &&
+      (current_time_ms < previous_results_.front().time_)) {
+    error_reporter_->Report(
+        "Results must be fed in increasing time order, but received a "
+        "timestamp of %d that was earlier than the previous one of %d",
+        current_time_ms, previous_results_.front().time_);
+    return kTfLiteError;
+  }
+
+  // Add the latest results to the head of the queue.
+  previous_results_.push_back({current_time_ms, latest_results->data.uint8});
+
+  // Prune any earlier results that are too old for the averaging window.
+  const int64_t time_limit = current_time_ms - average_window_duration_ms_;
+  while ((!previous_results_.empty()) &&
+         previous_results_.front().time_ < time_limit) {
+    previous_results_.pop_front();
+  }
+
+  // If there are too few results, assume the result will be unreliable and
+  // bail.
+  const int64_t how_many_results = previous_results_.size();
+  const int64_t earliest_time = previous_results_.front().time_;
+  const int64_t samples_duration = current_time_ms - earliest_time;
+  if ((how_many_results < minimum_count_) ||
+      (samples_duration < (average_window_duration_ms_ / 4))) {
+    *found_command = previous_top_label_;
+    *score = 0;
+    *is_new_command = false;
+    return kTfLiteOk;
+  }
+
+  // Calculate the average score across all the results in the window.
+  int32_t average_scores[kCategoryCount];
+  for (int offset = 0; offset < previous_results_.size(); ++offset) {
+    PreviousResultsQueue::Result previous_result =
+        previous_results_.from_front(offset);
+    const uint8_t* scores = previous_result.scores_;
+    for (int i = 0; i < kCategoryCount; ++i) {
+      if (offset == 0) {
+        average_scores[i] = scores[i];
+      } else {
+        average_scores[i] += scores[i];
+      }
+    }
+  }
+  for (int i = 0; i < kCategoryCount; ++i) {
+    average_scores[i] /= how_many_results;
+  }
+
+  // Find the current highest scoring category.
+  int current_top_index = 0;
+  int32_t current_top_score = 0;
+  for (int i = 0; i < kCategoryCount; ++i) {
+    if (average_scores[i] > current_top_score) {
+      current_top_score = average_scores[i];
+      current_top_index = i;
+    }
+  }
+  const char* current_top_label = kCategoryLabels[current_top_index];
+
+  // If we've recently had another label trigger, assume one that occurs too
+  // soon afterwards is a bad result.
+  int64_t time_since_last_top;
+  if ((previous_top_label_ == kCategoryLabels[0]) ||
+      (previous_top_label_time_ == std::numeric_limits<int32_t>::min())) {
+    time_since_last_top = std::numeric_limits<int32_t>::max();
+  } else {
+    time_since_last_top = current_time_ms - previous_top_label_time_;
+  }
+  if ((current_top_score > detection_threshold_) &&
+      (current_top_label != previous_top_label_) &&
+      (time_since_last_top > suppression_ms_)) {
+    previous_top_label_ = current_top_label;
+    previous_top_label_time_ = current_time_ms;
+    *is_new_command = true;
+  } else {
+    *is_new_command = false;
+  }
+  *found_command = current_top_label;
+  *score = current_top_score;
+
+  return kTfLiteOk;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
new file mode 100644
index 0000000000000000000000000000000000000000..adefffe850076821dd1e0bf683fdd2180d6999ea
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
@@ -0,0 +1,158 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_RECOGNIZE_COMMANDS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_RECOGNIZE_COMMANDS_H_
+
+#include <cstdint>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+// Partial implementation of std::dequeue, just providing the functionality
+// that's needed to keep a record of previous neural network results over a
+// short time period, so they can be averaged together to produce a more
+// accurate overall prediction. This doesn't use any dynamic memory allocation
+// so it's a better fit for microcontroller applications, but this does mean
+// there are hard limits on the number of results it can store.
+class PreviousResultsQueue {
+ public:
+  PreviousResultsQueue(tflite::ErrorReporter* error_reporter)
+      : error_reporter_(error_reporter), front_index_(0), size_(0) {}
+
+  // Data structure that holds an inference result, and the time when it
+  // was recorded.
+  struct Result {
+    Result() : time_(0), scores_() {}
+    Result(int32_t time, uint8_t* scores) : time_(time) {
+      for (int i = 0; i < kCategoryCount; ++i) {
+        scores_[i] = scores[i];
+      }
+    }
+    int32_t time_;
+    uint8_t scores_[kCategoryCount];
+  };
+
+  int size() { return size_; }
+  bool empty() { return size_ == 0; }
+  Result& front() { return results_[front_index_]; }
+  Result& back() {
+    int back_index = front_index_ + (size_ - 1);
+    if (back_index >= kMaxResults) {
+      back_index -= kMaxResults;
+    }
+    return results_[back_index];
+  }
+
+  void push_back(const Result& entry) {
+    if (size() >= kMaxResults) {
+      error_reporter_->Report(
+          "Couldn't push_back latest result, too many already!");
+      return;
+    }
+    size_ += 1;
+    back() = entry;
+  }
+
+  Result pop_front() {
+    if (size() <= 0) {
+      error_reporter_->Report("Couldn't pop_front result, none present!");
+      return Result();
+    }
+    Result result = front();
+    front_index_ += 1;
+    if (front_index_ >= kMaxResults) {
+      front_index_ = 0;
+    }
+    size_ -= 1;
+    return result;
+  }
+
+  // Most of the functions are duplicates of dequeue containers, but this
+  // is a helper that makes it easy to iterate through the contents of the
+  // queue.
+  Result& from_front(int offset) {
+    if ((offset < 0) || (offset >= size_)) {
+      error_reporter_->Report("Attempt to read beyond the end of the queue!");
+      offset = size_ - 1;
+    }
+    int index = front_index_ + offset;
+    if (index >= kMaxResults) {
+      index -= kMaxResults;
+    }
+    return results_[index];
+  }
+
+ private:
+  tflite::ErrorReporter* error_reporter_;
+  static constexpr int kMaxResults = 50;
+  Result results_[kMaxResults];
+
+  int front_index_;
+  int size_;
+};
+
+// This class is designed to apply a very primitive decoding model on top of the
+// instantaneous results from running an audio recognition model on a single
+// window of samples. It applies smoothing over time so that noisy individual
+// label scores are averaged, increasing the confidence that apparent matches
+// are real.
+// To use it, you should create a class object with the configuration you
+// want, and then feed results from running a TensorFlow model into the
+// processing method. The timestamp for each subsequent call should be
+// increasing from the previous, since the class is designed to process a stream
+// of data over time.
+class RecognizeCommands {
+ public:
+  // labels should be a list of the strings associated with each one-hot score.
+  // The window duration controls the smoothing. Longer durations will give a
+  // higher confidence that the results are correct, but may miss some commands.
+  // The detection threshold has a similar effect, with high values increasing
+  // the precision at the cost of recall. The minimum count controls how many
+  // results need to be in the averaging window before it's seen as a reliable
+  // average. This prevents erroneous results when the averaging window is
+  // initially being populated for example. The suppression argument disables
+  // further recognitions for a set time after one has been triggered, which can
+  // help reduce spurious recognitions.
+  explicit RecognizeCommands(tflite::ErrorReporter* error_reporter,
+                             int32_t average_window_duration_ms = 1000,
+                             uint8_t detection_threshold = 51,
+                             int32_t suppression_ms = 500,
+                             int32_t minimum_count = 3);
+
+  // Call this with the results of running a model on sample data.
+  TfLiteStatus ProcessLatestResults(const TfLiteTensor* latest_results,
+                                    const int32_t current_time_ms,
+                                    const char** found_command, uint8_t* score,
+                                    bool* is_new_command);
+
+ private:
+  // Configuration
+  tflite::ErrorReporter* error_reporter_;
+  int32_t average_window_duration_ms_;
+  uint8_t detection_threshold_;
+  int32_t suppression_ms_;
+  int32_t minimum_count_;
+
+  // Working variables
+  PreviousResultsQueue previous_results_;
+  int previous_results_head_;
+  int previous_results_tail_;
+  const char* previous_top_label_;
+  int32_t previous_top_label_time_;
+};
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_RECOGNIZE_COMMANDS_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f0cc73f10b3dadfdf06cb0f2935140b792635add
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc
@@ -0,0 +1,207 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/experimental/micro/testing/test_utils.h"
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(PreviousResultsQueueBasic) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  PreviousResultsQueue queue(error_reporter);
+  TF_LITE_MICRO_EXPECT_EQ(0, queue.size());
+
+  uint8_t scores_a[4] = {0, 0, 0, 1};
+  queue.push_back({0, scores_a});
+  TF_LITE_MICRO_EXPECT_EQ(1, queue.size());
+  TF_LITE_MICRO_EXPECT_EQ(0, queue.front().time_);
+  TF_LITE_MICRO_EXPECT_EQ(0, queue.back().time_);
+
+  uint8_t scores_b[4] = {0, 0, 1, 0};
+  queue.push_back({1, scores_b});
+  TF_LITE_MICRO_EXPECT_EQ(2, queue.size());
+  TF_LITE_MICRO_EXPECT_EQ(0, queue.front().time_);
+  TF_LITE_MICRO_EXPECT_EQ(1, queue.back().time_);
+
+  PreviousResultsQueue::Result pop_result = queue.pop_front();
+  TF_LITE_MICRO_EXPECT_EQ(0, pop_result.time_);
+  TF_LITE_MICRO_EXPECT_EQ(1, queue.size());
+  TF_LITE_MICRO_EXPECT_EQ(1, queue.front().time_);
+  TF_LITE_MICRO_EXPECT_EQ(1, queue.back().time_);
+
+  uint8_t scores_c[4] = {0, 1, 0, 0};
+  queue.push_back({2, scores_c});
+  TF_LITE_MICRO_EXPECT_EQ(2, queue.size());
+  TF_LITE_MICRO_EXPECT_EQ(1, queue.front().time_);
+  TF_LITE_MICRO_EXPECT_EQ(2, queue.back().time_);
+}
+
+TF_LITE_MICRO_TEST(PreviousResultsQueuePushPop) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  PreviousResultsQueue queue(error_reporter);
+  TF_LITE_MICRO_EXPECT_EQ(0, queue.size());
+
+  for (int i = 0; i < 123; ++i) {
+    uint8_t scores[4] = {0, 0, 0, 1};
+    queue.push_back({i, scores});
+    TF_LITE_MICRO_EXPECT_EQ(1, queue.size());
+    TF_LITE_MICRO_EXPECT_EQ(i, queue.front().time_);
+    TF_LITE_MICRO_EXPECT_EQ(i, queue.back().time_);
+
+    PreviousResultsQueue::Result pop_result = queue.pop_front();
+    TF_LITE_MICRO_EXPECT_EQ(i, pop_result.time_);
+    TF_LITE_MICRO_EXPECT_EQ(0, queue.size());
+  }
+}
+
+TF_LITE_MICRO_TEST(RecognizeCommandsTestBasic) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  RecognizeCommands recognize_commands(error_reporter);
+
+  TfLiteTensor results = tflite::testing::CreateQuantizedTensor(
+      {255, 0, 0, 0}, tflite::testing::IntArrayFromInitializer({2, 1, 4}),
+      "input_tensor", 0.0f, 128.0f);
+
+  const char* found_command;
+  uint8_t score;
+  bool is_new_command;
+  TF_LITE_MICRO_EXPECT_EQ(
+      kTfLiteOk, recognize_commands.ProcessLatestResults(
+                     &results, 0, &found_command, &score, &is_new_command));
+}
+
+TF_LITE_MICRO_TEST(RecognizeCommandsTestFindCommands) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  RecognizeCommands recognize_commands(error_reporter, 1000, 51);
+
+  TfLiteTensor yes_results = tflite::testing::CreateQuantizedTensor(
+      {0, 0, 255, 0}, tflite::testing::IntArrayFromInitializer({2, 1, 4}),
+      "input_tensor", 0.0f, 128.0f);
+
+  bool has_found_new_command = false;
+  const char* new_command;
+  for (int i = 0; i < 10; ++i) {
+    const char* found_command;
+    uint8_t score;
+    bool is_new_command;
+    int32_t current_time_ms = 0 + (i * 100);
+    TF_LITE_MICRO_EXPECT_EQ(
+        kTfLiteOk, recognize_commands.ProcessLatestResults(
+                       &yes_results, current_time_ms, &found_command, &score,
+                       &is_new_command));
+    if (is_new_command) {
+      TF_LITE_MICRO_EXPECT(!has_found_new_command);
+      has_found_new_command = true;
+      new_command = found_command;
+    }
+  }
+  TF_LITE_MICRO_EXPECT(has_found_new_command);
+  TF_LITE_MICRO_EXPECT_EQ(0, tflite::testing::TestStrcmp("yes", new_command));
+
+  TfLiteTensor no_results = tflite::testing::CreateQuantizedTensor(
+      {0, 0, 0, 255}, tflite::testing::IntArrayFromInitializer({2, 1, 4}),
+      "input_tensor", 0.0f, 128.0f);
+  has_found_new_command = false;
+  new_command = "";
+  uint8_t score;
+  for (int i = 0; i < 10; ++i) {
+    const char* found_command;
+    bool is_new_command;
+    int32_t current_time_ms = 1000 + (i * 100);
+    TF_LITE_MICRO_EXPECT_EQ(
+        kTfLiteOk, recognize_commands.ProcessLatestResults(
+                       &no_results, current_time_ms, &found_command, &score,
+                       &is_new_command));
+    if (is_new_command) {
+      TF_LITE_MICRO_EXPECT(!has_found_new_command);
+      has_found_new_command = true;
+      new_command = found_command;
+    }
+  }
+  TF_LITE_MICRO_EXPECT(has_found_new_command);
+  TF_LITE_MICRO_EXPECT_EQ(231, score);
+  TF_LITE_MICRO_EXPECT_EQ(0, tflite::testing::TestStrcmp("no", new_command));
+}
+
+TF_LITE_MICRO_TEST(RecognizeCommandsTestBadInputLength) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  RecognizeCommands recognize_commands(error_reporter, 1000, 51);
+
+  TfLiteTensor bad_results = tflite::testing::CreateQuantizedTensor(
+      {0, 0, 255}, tflite::testing::IntArrayFromInitializer({2, 1, 3}),
+      "input_tensor", 0.0f, 128.0f);
+
+  const char* found_command;
+  uint8_t score;
+  bool is_new_command;
+  TF_LITE_MICRO_EXPECT_NE(
+      kTfLiteOk, recognize_commands.ProcessLatestResults(
+                     &bad_results, 0, &found_command, &score, &is_new_command));
+}
+
+TF_LITE_MICRO_TEST(RecognizeCommandsTestBadInputTimes) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  RecognizeCommands recognize_commands(error_reporter, 1000, 51);
+
+  TfLiteTensor results = tflite::testing::CreateQuantizedTensor(
+      {0, 0, 255, 0}, tflite::testing::IntArrayFromInitializer({2, 1, 4}),
+      "input_tensor", 0.0f, 128.0f);
+
+  const char* found_command;
+  uint8_t score;
+  bool is_new_command;
+  TF_LITE_MICRO_EXPECT_EQ(
+      kTfLiteOk, recognize_commands.ProcessLatestResults(
+                     &results, 100, &found_command, &score, &is_new_command));
+  TF_LITE_MICRO_EXPECT_NE(
+      kTfLiteOk, recognize_commands.ProcessLatestResults(
+                     &results, 0, &found_command, &score, &is_new_command));
+}
+
+TF_LITE_MICRO_TEST(RecognizeCommandsTestTooFewInputs) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  RecognizeCommands recognize_commands(error_reporter, 1000, 51);
+
+  TfLiteTensor results = tflite::testing::CreateQuantizedTensor(
+      {0, 0, 255, 0}, tflite::testing::IntArrayFromInitializer({2, 1, 4}),
+      "input_tensor", 0.0f, 128.0f);
+
+  const char* found_command;
+  uint8_t score;
+  bool is_new_command;
+  TF_LITE_MICRO_EXPECT_EQ(
+      kTfLiteOk, recognize_commands.ProcessLatestResults(
+                     &results, 100, &found_command, &score, &is_new_command));
+  TF_LITE_MICRO_EXPECT_EQ(0, score);
+  TF_LITE_MICRO_EXPECT_EQ(false, is_new_command);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/timer_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/timer_test.cc
deleted file mode 100644
index 0487a12b25fc17208f1d9ab2b51538102f7ec914..0000000000000000000000000000000000000000
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/timer_test.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/timer.h"
-
-#include <limits>
-
-#include "tensorflow/lite/c/c_api_internal.h"
-#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
-#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
-
-TF_LITE_MICRO_TESTS_BEGIN
-
-TF_LITE_MICRO_TEST(TestTimer) {
-  // Make sure that the technically-undefined overflow behavior we rely on below
-  // works on this platform. It's still not guaranteed, but at least this is a
-  // sanity check.  Turn off when running with ASan, as it will complain about
-  // the following undefined behavior.
-#ifndef ADDRESS_SANITIZER
-  int32_t overflow_value = std::numeric_limits<int32_t>::max();
-  overflow_value += 1;
-  TF_LITE_MICRO_EXPECT_EQ(std::numeric_limits<int32_t>::min(), overflow_value);
-#endif
-
-  const int32_t first_time = TimeInMilliseconds();
-  const int32_t second_time = TimeInMilliseconds();
-
-  // It's possible that the timer may have wrapped around from +BIG_NUM to
-  // -BIG_NUM between the first and second calls, since we're storing
-  // milliseconds in a 32-bit integer. It's not reasonable that the call itself
-  // would have taken more than 2^31 milliseconds though, so look at the
-  // difference and rely on integer overflow to ensure it's accurate.
-  const int32_t time_delta = (second_time - first_time);
-  TF_LITE_MICRO_EXPECT_LE(0, time_delta);
-}
-
-TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/kernels/BUILD b/tensorflow/lite/experimental/micro/kernels/BUILD
index a54fd41760d58f2023e6b7b2aac72ac5f5e95ae3..47ac85c605488bdaa30515325122019a2d88678f 100644
--- a/tensorflow/lite/experimental/micro/kernels/BUILD
+++ b/tensorflow/lite/experimental/micro/kernels/BUILD
@@ -48,22 +48,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "test_utils",
-    srcs = [
-    ],
-    hdrs = [
-        "test_utils.h",
-    ],
-    copts = tflite_copts(),
-    deps = [
-        "//tensorflow/lite/c:c_api_internal",
-        "//tensorflow/lite/core/api",
-        "//tensorflow/lite/experimental/micro:micro_framework",
-        "//tensorflow/lite/experimental/micro/testing:micro_test",
-    ],
-)
-
 tflite_micro_cc_test(
     name = "depthwise_conv_test",
     srcs = [
@@ -71,7 +55,6 @@ tflite_micro_cc_test(
     ],
     deps = [
         ":all_ops_resolver",
-        ":test_utils",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
@@ -85,7 +68,6 @@ tflite_micro_cc_test(
     ],
     deps = [
         ":all_ops_resolver",
-        ":test_utils",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
@@ -99,7 +81,6 @@ tflite_micro_cc_test(
     ],
     deps = [
         ":all_ops_resolver",
-        ":test_utils",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
diff --git a/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc b/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc
index f70437a4b943e6e71547e010a0fea9ab551194db..05ba8798c0dc34eab5c563489cf9fc928325d00f 100644
--- a/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc
+++ b/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
-#include "tensorflow/lite/experimental/micro/kernels/test_utils.h"
 #include "tensorflow/lite/experimental/micro/simple_tensor_allocator.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/experimental/micro/testing/test_utils.h"
 
 namespace tflite {
 namespace testing {
diff --git a/tensorflow/lite/experimental/micro/kernels/fully_connected_test.cc b/tensorflow/lite/experimental/micro/kernels/fully_connected_test.cc
index 300f8aaf78ad38a2cd4a7c715cf63315a0b2e751..c2e1446848db68a4be42eab282da34e38999670f 100644
--- a/tensorflow/lite/experimental/micro/kernels/fully_connected_test.cc
+++ b/tensorflow/lite/experimental/micro/kernels/fully_connected_test.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
-#include "tensorflow/lite/experimental/micro/kernels/test_utils.h"
 #include "tensorflow/lite/experimental/micro/simple_tensor_allocator.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/experimental/micro/testing/test_utils.h"
 
 namespace tflite {
 namespace testing {
diff --git a/tensorflow/lite/experimental/micro/kernels/softmax_test.cc b/tensorflow/lite/experimental/micro/kernels/softmax_test.cc
index 7253b3be8ce20ff6d30ca725060da606c416c8e1..8933b6c0ed090b175c5d42282dc0ec6f22142206 100644
--- a/tensorflow/lite/experimental/micro/kernels/softmax_test.cc
+++ b/tensorflow/lite/experimental/micro/kernels/softmax_test.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
-#include "tensorflow/lite/experimental/micro/kernels/test_utils.h"
 #include "tensorflow/lite/experimental/micro/simple_tensor_allocator.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/experimental/micro/testing/test_utils.h"
 
 namespace tflite {
 namespace testing {
diff --git a/tensorflow/lite/experimental/micro/mbed/debug_log.cc b/tensorflow/lite/experimental/micro/mbed/debug_log.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d4a4a5a8429bb7867c225a97696c28eb5ad8d3b7
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/mbed/debug_log.cc
@@ -0,0 +1,24 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/debug_log.h"
+
+#include <mbed.h>
+
+// On mbed platforms, we set up a serial port and write to it for debug logging.
+extern "C" void DebugLog(const char* s) {
+  static Serial pc(USBTX, USBRX);
+  pc.printf("%s", s);
+}
diff --git a/tensorflow/lite/experimental/micro/micro_error_reporter.h b/tensorflow/lite/experimental/micro/micro_error_reporter.h
index 0ab853ec2ac915a8eb3da87eb8b86f2ecec697c7..6c18367c95fc9f07eb67b90a0e736b64271d9291 100644
--- a/tensorflow/lite/experimental/micro/micro_error_reporter.h
+++ b/tensorflow/lite/experimental/micro/micro_error_reporter.h
@@ -17,26 +17,8 @@ limitations under the License.
 
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/experimental/micro/compatibility.h"
-
-#ifdef TF_LITE_MCU_DEBUG_LOG
-// These functions should be supplied by the micro target library
-extern "C" {
-#include <stdint.h>
-void DebugLog(const char* s);
-void DebugLogInt32(int32_t i);
-void DebugLogUInt32(uint32_t i);
-void DebugLogHex(uint32_t i);
-void DebugLogFloat(float i);
-}
-#else  // TF_LITE_MCU_DEBUG_LOG
-#include <cstdint>
-#include <cstdio>
-static void inline DebugLog(const char* s) { fprintf(stderr, "%s", s); }
-static void inline DebugLogInt32(int32_t i) { fprintf(stderr, "%d", i); }
-static void inline DebugLogUInt32(uint32_t i) { fprintf(stderr, "%d", i); }
-static void inline DebugLogHex(uint32_t i) { fprintf(stderr, "0x%8x", i); }
-static void inline DebugLogFloat(float i) { fprintf(stderr, "%f", i); }
-#endif  // TF_LITE_MCU_DEBUG_LOG
+#include "tensorflow/lite/experimental/micro/debug_log.h"
+#include "tensorflow/lite/experimental/micro/debug_log_numbers.h"
 
 namespace tflite {
 
diff --git a/tensorflow/lite/experimental/micro/riscv32_mcu/README.md b/tensorflow/lite/experimental/micro/riscv32_mcu/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5477d7ae951cbd8c47312f51acdea16d87f5f910
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/riscv32_mcu/README.md
@@ -0,0 +1,7 @@
+# RISC-V MCU
+
+This folder contains TFLite kernel operations optimized for RISC-V micro
+controllers.
+
+It is designed to be portable even to 'bare metal', so it follows the same
+design goals as the micro experimental port.
diff --git a/tensorflow/lite/experimental/micro/riscv32_mcu/debug_log.cc b/tensorflow/lite/experimental/micro/riscv32_mcu/debug_log.cc
index f0065267bfaa75bf1b0e271595280d49f2f2abe9..d1c2df866e9f8e4c99aabcc7fe73e4879b079b42 100644
--- a/tensorflow/lite/experimental/micro/riscv32_mcu/debug_log.cc
+++ b/tensorflow/lite/experimental/micro/riscv32_mcu/debug_log.cc
@@ -15,176 +15,4 @@ limitations under the License.
 
 #include <stdio.h>
 
-namespace {
-
-// All input buffers to the number conversion functions must be this long.
-static const int kFastToBufferSize = 48;
-
-// Reverses a zero-terminated string in-place.
-char* ReverseStringInPlace(char* start, char* end) {
-  char* p1 = start;
-  char* p2 = end - 1;
-  while (p1 < p2) {
-    char tmp = *p1;
-    *p1++ = *p2;
-    *p2-- = tmp;
-  }
-  return start;
-}
-
-// Appends a string to a string, in-place. You need to pass in the maximum
-// string length as the second argument.
-char* StrCatStr(char* main, int main_max_length, char* to_append) {
-  char* current = main;
-  while (*current != 0) {
-    ++current;
-  }
-  char* current_end = main + (main_max_length - 1);
-  while ((*to_append != 0) && (current < current_end)) {
-    *current = *to_append;
-    ++current;
-    ++to_append;
-  }
-  *current = 0;
-  return current;
-}
-
-char* StrCpy(char* main, int main_max_length, const char* source) {
-  char* current = main;
-  char* current_end = main + (main_max_length - 1);
-  while ((*source != 0) && (current < current_end)) {
-    *current = *source;
-    ++current;
-    ++source;
-  }
-  *current = 0;
-  return current;
-}
-
-// Populates the provided buffer with an ASCII representation of the number.
-char* FastUInt32ToBufferLeft(uint32_t i, char* buffer, int base) {
-  char* start = buffer;
-  do {
-    int32_t digit = i % base;
-    char character;
-    if (digit < 10) {
-      character = '0' + digit;
-    } else {
-      character = 'a' + (digit - 10);
-    }
-    *buffer++ = character;
-    i /= base;
-  } while (i > 0);
-  *buffer = 0;
-  ReverseStringInPlace(start, buffer);
-  return buffer;
-}
-
-// Populates the provided buffer with an ASCII representation of the number.
-char* FastInt32ToBufferLeft(int32_t i, char* buffer) {
-  uint32_t u = i;
-  if (i < 0) {
-    *buffer++ = '-';
-    u = -u;
-  }
-  return FastUInt32ToBufferLeft(u, buffer, 10);
-}
-
-// Converts a number to a string and appends it to another.
-char* StrCatInt32(char* main, int main_max_length, int32_t number) {
-  char number_string[kFastToBufferSize];
-  FastInt32ToBufferLeft(number, number_string);
-  return StrCatStr(main, main_max_length, number_string);
-}
-
-// Converts a number to a string and appends it to another.
-char* StrCatUInt32(char* main, int main_max_length, uint32_t number, int base) {
-  char number_string[kFastToBufferSize];
-  FastUInt32ToBufferLeft(number, number_string, base);
-  return StrCatStr(main, main_max_length, number_string);
-}
-
-// Populates the provided buffer with ASCII representation of the float number.
-// Avoids the use of any floating point instructions (since these aren't
-// supported on many microcontrollers) and as a consequence prints values with
-// power-of-two exponents.
-char* FastFloatToBufferLeft(float i, char* buffer) {
-  char* current = buffer;
-  char* current_end = buffer + (kFastToBufferSize - 1);
-  // Access the bit fields of the floating point value to avoid requiring any
-  // float instructions. These constants are derived from IEEE 754.
-  const uint32_t sign_mask = 0x80000000;
-  const uint32_t exponent_mask = 0x7f800000;
-  const int32_t exponent_shift = 23;
-  const int32_t exponent_bias = 127;
-  const uint32_t fraction_mask = 0x007fffff;
-  const uint32_t u = *(uint32_t*)(&i);
-  const int32_t exponent =
-      ((u & exponent_mask) >> exponent_shift) - exponent_bias;
-  const uint32_t fraction = (u & fraction_mask);
-  // Expect ~0x2B1B9D3 for fraction.
-  if (u & sign_mask) {
-    *current = '-';
-    current += 1;
-  }
-  *current = 0;
-  // These are special cases for infinities and not-a-numbers.
-  if (exponent == 128) {
-    if (fraction == 0) {
-      current = StrCatStr(current, (current_end - current), "Inf");
-      return current;
-    } else {
-      current = StrCatStr(current, (current_end - current), "NaN");
-      return current;
-    }
-  }
-  // 0x007fffff represents 0.99... for the fraction, so to print the correct
-  // decimal digits we need to scale our value before passing it to the
-  // conversion function. This scale should be 10000000/8388608 = 1.1920928955.
-  // We can approximate this using multipy-adds and right-shifts using the
-  // values in this array.
-  const int32_t scale_shifts_size = 13;
-  const int8_t scale_shifts[13] = {3,  4,  8,  11, 13, 14, 17,
-                                   18, 19, 20, 21, 22, 23};
-  uint32_t scaled_fraction = fraction;
-  for (int i = 0; i < scale_shifts_size; ++i) {
-    scaled_fraction += (fraction >> scale_shifts[i]);
-  }
-  *current = '1';
-  current += 1;
-  *current = '.';
-  current += 1;
-  *current = 0;
-  current = StrCatUInt32(current, (current_end - current), scaled_fraction, 10);
-  current = StrCatStr(current, (current_end - current), "*2^");
-  current = StrCatInt32(current, (current_end - current), exponent);
-  return current;
-}
-
-}  // namespace
-
 extern "C" void DebugLog(const char* s) { puts(s); }
-
-extern "C" void DebugLogInt32(int32_t i) {
-  char number_string[kFastToBufferSize];
-  FastInt32ToBufferLeft(i, number_string);
-  DebugLog(number_string);
-}
-
-extern "C" void DebugLogUInt32(uint32_t i) {
-  char number_string[kFastToBufferSize];
-  FastUInt32ToBufferLeft(i, number_string, 10);
-  DebugLog(number_string);
-}
-
-extern "C" void DebugLogHex(uint32_t i) {
-  char number_string[kFastToBufferSize];
-  FastUInt32ToBufferLeft(i, number_string, 16);
-  DebugLog(number_string);
-}
-
-extern "C" void DebugLogFloat(float i) {
-  char number_string[kFastToBufferSize];
-  FastFloatToBufferLeft(i, number_string);
-  DebugLog(number_string);
-}
diff --git a/tensorflow/lite/experimental/micro/testing/BUILD b/tensorflow/lite/experimental/micro/testing/BUILD
index 5a31a709ca3f0205b8764528d6e8f2c0fe0f93d0..1623df5b8650a34aa900cb6d362e444bc640fc8e 100644
--- a/tensorflow/lite/experimental/micro/testing/BUILD
+++ b/tensorflow/lite/experimental/micro/testing/BUILD
@@ -10,8 +10,10 @@ cc_library(
     name = "micro_test",
     hdrs = [
         "micro_test.h",
+        "test_utils.h",
     ],
     deps = [
+        "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
     ],
 )
diff --git a/tensorflow/lite/experimental/micro/kernels/test_utils.h b/tensorflow/lite/experimental/micro/testing/test_utils.h
similarity index 91%
rename from tensorflow/lite/experimental/micro/kernels/test_utils.h
rename to tensorflow/lite/experimental/micro/testing/test_utils.h
index 95f2d8a9d217a1b1f23c0198ddce5156e1c6cb36..e37eaf46e0815087cdc48c6aa23353f6f1cf9d7f 100644
--- a/tensorflow/lite/experimental/micro/kernels/test_utils.h
+++ b/tensorflow/lite/experimental/micro/testing/test_utils.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_KERNELS_TEST_UTILS_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_KERNELS_TEST_UTILS_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_TESTING_TEST_UTILS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_TESTING_TEST_UTILS_H_
 
 #include <cstdarg>
 #include <initializer_list>
@@ -21,8 +21,7 @@ limitations under the License.
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
-#include "tensorflow/lite/core/api/error_reporter.h"
-#include "tensorflow/lite/experimental/micro/kernels/test_utils.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace tflite {
@@ -164,7 +163,20 @@ inline TfLiteTensor CreateQuantized32Tensor(std::initializer_list<int32_t> data,
   return CreateQuantized32Tensor(data.begin(), dims, name, min, max);
 }
 
+// Do a simple string comparison for testing purposes, without requiring the
+// standard C library.
+inline int TestStrcmp(const char* a, const char* b) {
+  if ((a == nullptr) || (b == nullptr)) {
+    return -1;
+  }
+  while ((*a != 0) && (*a == *b)) {
+    a++;
+    b++;
+  }
+  return *(const unsigned char*)a - *(const unsigned char*)b;
+}
+
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_KERNELS_TEST_UTILS_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_TESTING_TEST_UTILS_H_
diff --git a/tensorflow/lite/experimental/micro/tools/make/.gitignore b/tensorflow/lite/experimental/micro/tools/make/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..752f078fb56ca734056d694d0528943a82a8ef3e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/.gitignore
@@ -0,0 +1,2 @@
+downloads
+gen
diff --git a/tensorflow/lite/experimental/micro/tools/make/Makefile b/tensorflow/lite/experimental/micro/tools/make/Makefile
index 438eae647847188229fb56c351b64a8e652400e7..fde195118b18ca308940292c7bd5706ecace8563 100644
--- a/tensorflow/lite/experimental/micro/tools/make/Makefile
+++ b/tensorflow/lite/experimental/micro/tools/make/Makefile
@@ -1,5 +1,9 @@
+
 MAKEFILE_DIR := tensorflow/lite/experimental/micro/tools/make
 
+# Pull in some convenience functions.
+include $(MAKEFILE_DIR)/helper_functions.inc
+
 # Try to figure out the host system
 HOST_OS :=
 ifeq ($(OS),Windows_NT)
@@ -21,6 +25,11 @@ HOST_ARCH := $(shell if [[ $(shell uname -m) =~ i[345678]86 ]]; then echo x86_32
 TARGET := $(HOST_OS)
 TARGET_ARCH := $(HOST_ARCH)
 
+# Specify TAGS on the command line to add a particular set of specialized
+# implementations, for example TAGS="CMSIS disco_f746ng" to target a Discovery
+# STM32F746NG board, using the CMSIS library's implementations where possible.
+ALL_TAGS := $(TAGS) $(TARGET)
+
 INCLUDES := \
 -I. \
 -I$(MAKEFILE_DIR)/../../../../../ \
@@ -53,33 +62,13 @@ CC_PREFIX :=
 # runtime that can be linked in to other programs.
 MICROLITE_LIB_NAME := libtensorflow-microlite.a
 
-# Test binary for the microcontroller speech model.
-MICRO_SPEECH_TEST_SRCS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc
-
-# Test binary for the microcontroller speech model.
-PREPROCESSOR_TEST_SRCS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc
-
-PREPROCESSOR_REFERENCE_TEST_SRCS = \
-$(PREPROCESSOR_TEST_SRCS) \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
-
-PREPROCESSOR_FIXED_TEST_SRCS += \
-$(PREPROCESSOR_TEST_SRCS) \
-tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
-
 MICROLITE_TEST_SRCS := \
 $(wildcard tensorflow/lite/experimental/micro/*test.cc) \
 $(wildcard tensorflow/lite/experimental/micro/kernels/*test.cc)
 
+MICROLITE_TEST_HDRS := \
+$(wildcard tensorflow/lite/experimental/micro/testing/*.h)
+
 MICROLITE_CC_BASE_SRCS := \
 $(wildcard tensorflow/lite/experimental/micro/*.cc) \
 $(wildcard tensorflow/lite/experimental/micro/kernels/*.cc) \
@@ -90,6 +79,51 @@ tensorflow/lite/core/api/op_resolver.cc \
 tensorflow/lite/kernels/kernel_util.cc \
 tensorflow/lite/kernels/internal/quantization_util.cc
 MICROLITE_CC_SRCS := $(filter-out $(MICROLITE_TEST_SRCS), $(MICROLITE_CC_BASE_SRCS))
+MICROLITE_CC_SRCS := $(call specialize,$(MICROLITE_CC_SRCS))
+
+MICROLITE_CC_HDRS := \
+$(wildcard tensorflow/lite/experimental/micro/*.h) \
+$(wildcard tensorflow/lite/experimental/micro/kernels/*.h) \
+LICENSE \
+tensorflow/lite/c/c_api_internal.h \
+tensorflow/lite/c/builtin_op_data.h \
+tensorflow/lite/core/api/error_reporter.h \
+tensorflow/lite/core/api/flatbuffer_conversions.h \
+tensorflow/lite/core/api/op_resolver.h \
+tensorflow/lite/kernels/kernel_util.h \
+tensorflow/lite/kernels/op_macros.h \
+tensorflow/lite/kernels/padding.h \
+tensorflow/lite/kernels/internal/common.h \
+tensorflow/lite/kernels/internal/compatibility.h \
+tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h \
+tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h \
+tensorflow/lite/kernels/internal/reference/fully_connected.h \
+tensorflow/lite/kernels/internal/reference/softmax.h \
+tensorflow/lite/kernels/internal/round.h \
+tensorflow/lite/kernels/internal/tensor_ctypes.h \
+tensorflow/lite/kernels/internal/types.h \
+tensorflow/lite/kernels/internal/quantization_util.h \
+tensorflow/lite/schema/schema_generated.h \
+tensorflow/lite/version.h
+
+THIRD_PARTY_CC_HDRS := \
+third_party/gemmlowp/fixedpoint/fixedpoint.h \
+third_party/gemmlowp/fixedpoint/fixedpoint_sse.h \
+third_party/gemmlowp/internal/detect_platform.h \
+third_party/gemmlowp/LICENSE \
+third_party/flatbuffers/include/flatbuffers/base.h \
+third_party/flatbuffers/include/flatbuffers/stl_emulation.h \
+third_party/flatbuffers/include/flatbuffers/flatbuffers.h \
+third_party/flatbuffers/LICENSE.txt
+
+MAKE_PROJECT_FILES := \
+  README_MAKE.md \
+  Makefile
+
+MBED_PROJECT_FILES := \
+  README_MBED.md \
+  mbed-os.lib \
+  mbed_app.json
 
 # These target-specific makefiles should modify or replace options like
 # CXXFLAGS or LIBS to work for a specific targetted architecture. All logic
@@ -97,10 +131,9 @@ MICROLITE_CC_SRCS := $(filter-out $(MICROLITE_TEST_SRCS), $(MICROLITE_CC_BASE_SR
 # keep this main makefile focused on the sources and dependencies.
 include $(wildcard $(MAKEFILE_DIR)/targets/*_makefile.inc)
 
+ALL_TAGS += $(TARGET_ARCH)
+
 ALL_SRCS := \
-	$(MICRO_SPEECH_TEST_SRCS) \
-	$(PREPROCESSOR_REFERENCE_TEST_SRCS) \
-	$(PREPROCESSOR_FIXED_TEST_SRCS) \
 	$(MICROLITE_CC_SRCS) \
 	$(MICROLITE_TEST_SRCS)
 
@@ -109,25 +142,16 @@ GENDIR := $(MAKEFILE_DIR)/gen/$(TARGET)_$(TARGET_ARCH)/
 OBJDIR := $(GENDIR)obj/
 BINDIR := $(GENDIR)bin/
 LIBDIR := $(GENDIR)lib/
+PRJDIR := $(GENDIR)prj/
 
 MICROLITE_LIB_PATH := $(LIBDIR)$(MICROLITE_LIB_NAME)
 
-MICRO_SPEECH_TEST_BINARY := $(BINDIR)micro_speech_test
-PREPROCESSOR_REFERENCE_TEST_BINARY := $(BINDIR)preprocessor_reference_test
-PREPROCESSOR_FIXED_TEST_BINARY := $(BINDIR)preprocessor_fixed_test
-
 CXX := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}g++
 CC := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}gcc
 AR := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}ar
 
-MICRO_SPEECH_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(patsubst %.S,%.o,$(MICRO_SPEECH_TEST_SRCS)))))
-
-PREPROCESSOR_REFERENCE_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_REFERENCE_TEST_SRCS))))
-
-PREPROCESSOR_FIXED_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_FIXED_TEST_SRCS))))
+# Load the examples.
+include $(wildcard tensorflow/lite/experimental/micro/examples/*/Makefile.inc)
 
 MICROLITE_LIB_OBJS := $(addprefix $(OBJDIR), \
 $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICROLITE_CC_SRCS))))
@@ -145,13 +169,13 @@ $(OBJDIR)%.o: %.c
 	@mkdir -p $(dir $@)
 	$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
 
-  # For normal manually-created TensorFlow ASM source files.
+# For normal manually-created TensorFlow ASM source files.
 $(OBJDIR)%.o: %.S
 	@mkdir -p $(dir $@)
 	$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
 
 # The target that's compiled if there's no command-line arguments.
-all: $(MICROLITE_LIB_PATH) $(MICRO_SPEECH_TEST_BINARY) $(PREPROCESSOR_TEST_BINARY)
+all: $(MICROLITE_LIB_PATH)
 
 microlite: $(MICROLITE_LIB_PATH)
 
@@ -164,42 +188,6 @@ $(MICROLITE_LIB_PATH): tensorflow/lite/schema/schema_generated.h $(MICROLITE_LIB
 	@mkdir -p $(dir $@)
 	$(AR) $(ARFLAGS) $(MICROLITE_LIB_PATH) $(MICROLITE_LIB_OBJS)
 
-$(MICRO_SPEECH_TEST_BINARY): $(MICRO_SPEECH_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(MICRO_SPEECH_TEST_BINARY) $(MICRO_SPEECH_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-
-micro_speech_test: $(MICRO_SPEECH_TEST_BINARY)
-micro_speech_test_bin: $(MICRO_SPEECH_TEST_BINARY).bin
-
-test_micro_speech: $(MICRO_SPEECH_TEST_BINARY)
-	$(TEST_SCRIPT) $(MICRO_SPEECH_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
-
-$(PREPROCESSOR_REFERENCE_TEST_BINARY): $(PREPROCESSOR_REFERENCE_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(PREPROCESSOR_REFERENCE_TEST_BINARY) $(PREPROCESSOR_REFERENCE_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-
-preprocessor_reference_test: $(PREPROCESSOR_REFERENCE_TEST_BINARY)
-preprocessor_reference_test_bin: $(PREPROCESSOR_REFERENCE_TEST_BINARY).bin
-
-test_preprocessor_reference: $(PREPROCESSOR_REFERENCE_TEST_BINARY)
-	$(TEST_SCRIPT) $(PREPROCESSOR_REFERENCE_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
-
-$(PREPROCESSOR_FIXED_TEST_BINARY): $(PREPROCESSOR_FIXED_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(PREPROCESSOR_FIXED_TEST_BINARY) $(PREPROCESSOR_FIXED_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-
-preprocessor_fixed_test: $(PREPROCESSOR_FIXED_TEST_BINARY)
-preprocessor_fixed_test_bin: $(PREPROCESSOR_FIXED_TEST_BINARY).bin
-
-test_preprocessor_fixed: $(PREPROCESSOR_FIXED_TEST_BINARY)
-	$(TEST_SCRIPT) $(PREPROCESSOR_FIXED_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
-
 $(BINDIR)%_test : $(OBJDIR)%_test.o $(MICROLITE_LIB_PATH)
 	@mkdir -p $(dir $@)
 	$(CXX) $(CXXFLAGS) $(INCLUDES) \
@@ -209,7 +197,9 @@ $(BINDIR)%_test : $(OBJDIR)%_test.o $(MICROLITE_LIB_PATH)
 $(BINDIR)%.test_target: $(BINDIR)%_test
 	$(TEST_SCRIPT) $< '~~~ALL TESTS PASSED~~~'
 
-$(info $(MICROLITE_TEST_TARGETS))
+# Generate standalone makefile projects for all of the test targets.
+$(foreach TEST_TARGET,$(MICROLITE_TEST_SRCS),\
+$(eval $(call microlite_test,$(notdir $(basename $(TEST_TARGET))),$(TEST_TARGET))))
 
 test: test_micro_speech $(MICROLITE_TEST_TARGETS)
 
diff --git a/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh b/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
index 1661695225ef6be0ddff420813adb8e84761c675..82c15e32f6572f36588945431918cf75299d3a64 100755
--- a/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
+++ b/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
@@ -35,6 +35,9 @@ CMSIS_URL="https://github.com/ARM-software/CMSIS_5/archive/5.4.0.zip"
 STM32_BARE_LIB_URL="https://github.com/google/stm32_bare_lib/archive/c07d611fb0af58450c5a3e0ab4d52b47f99bc82d.zip"
 SIFIVE_FE310_LIB_URL="https://github.com/sifive/freedom-e-sdk/archive/baeeb8fd497a99b3c141d7494309ec2e64f19bdf.zip"
 RISCV_TOOLCHAIN_URL="https://static.dev.sifive.com/dev-tools/riscv64-unknown-elf-gcc-20181030-x86_64-linux-ubuntu14.tar.gz"
+AP3_URL="https://github.com/AmbiqMicro/TFLiteMicro_Apollo3/archive/dfbcef9a57276c087d95aab7cb234f1d4c9eaaba.zip"
+CUST_CMSIS_URL="https://github.com/AmbiqMicro/TFLiteMicro_CustCMSIS/archive/8f63966c5692e6a3a83956efd2e4aed77c4c9949.zip"
+GCC_EMBEDDED_URL="https://developer.arm.com/-/media/Files/downloads/gnu-rm/7-2018q2/gcc-arm-none-eabi-7-2018-q2-update-linux.tar.bz2"
 
 download_and_extract() {
   local usage="Usage: download_and_extract URL DIR"
@@ -44,6 +47,8 @@ download_and_extract() {
   mkdir -p "${dir}"
   if [[ "${url}" == *gz ]]; then
     curl -Ls "${url}" | tar -C "${dir}" --strip-components=1 -xz
+  elif [[ "${url}" == *bz2 ]]; then
+    curl -Ls "${url}" | tar -C "${dir}" --strip-components=1 -xj
   elif [[ "${url}" == *zip ]]; then
     tempdir=$(mktemp -d)
     tempdir2=$(mktemp -d)
@@ -67,11 +72,37 @@ download_and_extract() {
   find "${dir}" -type f -name '*BUILD' -delete
 }
 
+patch_apollo3_sdk() {
+  local ap3_dir="${1}"
+  if [ ! -f ${ap3_dir}/VERSION.txt ]; then
+    echo "Could not find ${ap3_dir}, skipping Apollo3 SDK";
+    return;
+  fi
+  local src_dir=${ap3_dir}/boards/apollo3_evb/examples/hello_world/gcc
+  local dest_dir=${ap3_dir}/boards/apollo3_evb/examples/hello_world/gcc_patched
+  rm -rf ${dest_dir}
+  mkdir ${dest_dir}
+  cp "${src_dir}/startup_gcc.c" "${dest_dir}/startup_gcc.c"
+  cp "${src_dir}/hello_world.ld" "${dest_dir}/apollo3evb.ld"
+  sed -i -e '131s/1024/1024\*20/g' "${dest_dir}/startup_gcc.c"
+  sed -i -e 's/main/_main/g' "${dest_dir}/startup_gcc.c"
+  sed -i -e '3s/hello_world.ld/apollo3evb.ld/g' "${dest_dir}/apollo3evb.ld"
+  sed -i -e '3s/startup_gnu/startup_gcc/g' "${dest_dir}/apollo3evb.ld"
+  sed -i -e '6s/am_reset_isr/Reset_Handler/g' "${dest_dir}/apollo3evb.ld"
+  sed -i -e '22s/\*(.text\*)/\*(.text\*)\n\n\t\/\* These are the C++ global constructors.  Stick them all here and\n\t \* then walk through the array in main() calling them all.\n\t \*\/\n\t_init_array_start = .;\n\tKEEP (\*(SORT(.init_array\*)))\n\t_init_array_end = .;\n\n\t\/\* XXX Currently not doing anything for global destructors. \*\/\n/g' "${dest_dir}/apollo3evb.ld"
+  sed -i -e "70s/} > SRAM/} > SRAM\n    \/\* Add this to satisfy reference to symbol 'end' from libnosys.a(sbrk.o)\n     \* to denote the HEAP start.\n     \*\/\n   end = .;/g" "${dest_dir}/apollo3evb.ld"
+  echo "Finished preparing Apollo3 files"
+}
+
 download_and_extract "${GEMMLOWP_URL}" "${DOWNLOADS_DIR}/gemmlowp"
 download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
 download_and_extract "${CMSIS_URL}" "${DOWNLOADS_DIR}/cmsis"
 download_and_extract "${STM32_BARE_LIB_URL}" "${DOWNLOADS_DIR}/stm32_bare_lib"
 download_and_extract "${SIFIVE_FE310_LIB_URL}" "${DOWNLOADS_DIR}/sifive_fe310_lib"
 download_and_extract "${RISCV_TOOLCHAIN_URL}" "${DOWNLOADS_DIR}/riscv_toolchain"
+download_and_extract "${AP3_URL}" "${DOWNLOADS_DIR}/apollo3_ext"
+patch_apollo3_sdk "${DOWNLOADS_DIR}/Apollo3-SDK-2018.08.13"
+download_and_extract "${CUST_CMSIS_URL}" "${DOWNLOADS_DIR}/CMSIS_ext"
+download_and_extract "${GCC_EMBEDDED_URL}" "${DOWNLOADS_DIR}/gcc_embedded"
 
 echo "download_dependencies.sh completed successfully." >&2
diff --git a/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc b/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc
new file mode 100644
index 0000000000000000000000000000000000000000..87c002635f55be334bbb22a892a3013e92087cc2
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc
@@ -0,0 +1,117 @@
+
+# Reverses a space-separated list of words.
+reverse = $(if $(1),$(call reverse,$(wordlist 2,$(words $(1)),$(1)))) $(firstword $(1))
+
+# Look for platform or target-specific implementation files to replace reference
+# implementations with, given a tag. These are expected to occur in subfolders
+# of a directory where a reference implementation exists, and have the same
+# interface and header file. For example,
+# tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
+# defines a module for supplying audio data, but since no platform or OS can be
+# presumed, it just always returns zeroes for its samples. The MacOS-specific
+# tensorflow/lite/experimental/micro/examples/micro_speech/osx/audio_provider.cc
+# has an implementation that relies on CoreAudio, and there are equivalent
+# versions for other operating systems.
+# The specific implementation yielded by the first tag in the list that produces
+# a match is returned, else the reference version if none of the tags produce a
+# match.
+# All lists of source files are put through this substitution process with the
+# tags of their target OS and architecture, so that implementations can be added
+# by simply placing them in the file tree, with no changes to the build files
+# needed.
+# One confusing thing about this implementation is that we're using wildcard to
+# act as a 'does file exist?' function, rather than expanding an expression.
+# Wildcard will return an empty string if given a plain file path with no actual
+# wildcards, if the file doesn't exist, so taking the first word of the list
+# between that and the reference path will pick the specialized one if it's
+# available.
+substitute_specialized_implementation = \
+  $(firstword $(wildcard $(dir $(1))$(2)/$(notdir $(1))) $(wildcard $(1)))
+substitute_specialized_implementations = \
+  $(foreach source,$(1),$(call substitute_specialized_implementation,$(source),$(2)))
+# Here we're first looking for specialized implementations in ref_dir/$(TAG1)
+# and then ref_dir/$(TAG2), etc, before falling back to ref_dir's
+# implementation.
+# The argument to this function should be a list of space-separated file paths,
+# with any wildcards already expanded.
+define specialize_on_tags
+$(if $(2),$(call substitute_specialized_implementations,$(call specialize_on_tags,$(1),$(wordlist 2,$(words $(2)),$(2))),$(firstword $(2))),$(1))
+endef
+# The entry point that most targets should use to find implementation-specific
+# versions of their source files. The only argument is a list of file paths.
+specialize = $(call specialize_on_tags,$(1),$(strip $(call reverse,$(ALL_TAGS))))
+
+# Creates a set of rules to build a standalone makefile project for an
+# executable, including all of the source and header files required in a
+# separate folder and a simple makefile.
+# Arguments are:
+# 1 - Project type (make, mbed, etc).
+# 2 - Project file template name.
+# 3 - Name of executable.
+# 4 - List of C/C++ source files needed to build the target.
+# 5 - List of C/C++ header files needed to build the target.
+# 6 - Linker flags required.
+# 7 - C++ compilation flags needed.
+# Calling eval on the output will create a <Name>_makefile target that you
+# can invoke to create the standalone project.
+define generate_project
+$(PRJDIR)$(3)/$(1)/%: %
+	@mkdir -p $$(dir $$@)
+	cp $$< $$@
+
+$(PRJDIR)$(3)/$(1)/third_party/%: tensorflow/lite/experimental/micro/tools/make/downloads/%
+	@mkdir -p $$(dir $$@)
+	cp $$< $$@
+
+$(PRJDIR)$(3)/$(1)/%: tensorflow/lite/experimental/micro/tools/make/templates/%.tpl
+	@mkdir -p $$(dir $$@)
+	sed -E 's#\%\{SRCS\}\%#$(4)#g' $$< | \
+	sed -E 's#\%\{EXECUTABLE\}\%#$(3)#g' | \
+	sed -E 's#\%\{LINKER_FLAGS\}\%#$(6)#g' | \
+	sed -E 's#\%\{CXX_FLAGS\}\%#$(7)#g' > $$@
+
+generate_$(3)_$(1)_project: $(addprefix $(PRJDIR)$(3)/$(1)/, $(4) $(5) $(2))
+endef
+
+# Specialized version of generate_project for TF Lite Micro test targets that
+# automatically includes standard library files, so you just need to pass the
+# test name and any extra source files required.
+# Arguments are:
+# 1 - Name of test.
+# 2 - C/C++ source files implementing the test.
+# 3 - C/C++ header files needed for the test.
+# Calling eval on the output will create targets that you can invoke to
+# generate the standalone project.
+define generate_microlite_projects
+$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(MICROLITE_LIBS),$(CXXFLAGS))
+$(call generate_project,mbed,$(MBED_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(MICROLITE_LIBS),$(CXXFLAGS))
+endef
+
+
+# Handles the details of generating a binary target, including specializing
+# for the current platform, and generating project file targets.
+# Arguments are:
+# 1 - Name of test.
+# 2 - C/C++ source files implementing the test.
+# 3 - C/C++ header files needed for the test.
+# Calling eval on the output will create the targets that you need.
+define microlite_test
+$(1)_LOCAL_SRCS := $(2)
+$(1)_LOCAL_SRCS := $$(call specialize,$$($(1)_LOCAL_SRCS))
+ALL_SRCS += $$($(1)_LOCAL_SRCS)
+$(1)_LOCAL_HDRS := $(3)
+$(1)_LOCAL_OBJS := $$(addprefix $$(OBJDIR), \
+$$(patsubst %.cc,%.o,$$(patsubst %.c,%.o,$$($(1)_LOCAL_SRCS))))
+$(1)_BINARY := $$(BINDIR)$(1)
+ALL_BINARIES += $$($(1)_BINARY)
+$$($(1)_BINARY): $$($(1)_LOCAL_OBJS) $$(MICROLITE_LIB_PATH)
+	@mkdir -p $$(dir $$@)
+	$$(CXX) $$(CXXFLAGS) $$(INCLUDES) \
+	-o $$($(1)_BINARY) $$($(1)_LOCAL_OBJS) \
+	$$(LIBFLAGS) $$(MICROLITE_LIB_PATH) $$(LDFLAGS) $$(MICROLITE_LIBS)
+$(1): $$($(1)_BINARY)
+$(1)_bin: $$($(1)_BINARY).bin
+test_$(1): $$($(1)_BINARY)
+	$$(TEST_SCRIPT) $$($(1)_BINARY) '~~~ALL TESTS PASSED~~~'
+$(eval $(call generate_microlite_projects,$(1),$(call specialize,$(2)),$(3)))
+endef
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/.gitignore b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..cb646e29d9ab950e7697b284cc5a87a302397219
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/.gitignore
@@ -0,0 +1,4 @@
+startup_gcc.c
+am_*.c
+libam*.a
+
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/_main.c b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/_main.c
deleted file mode 100644
index bd238ac55f96dbe62aa16a92180a5995ce395945..0000000000000000000000000000000000000000
--- a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/_main.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <stdint.h>
-#include "am_mcu_apollo.h"              // Defines AM_CMSIS_REGS
-#include "am_bsp.h"
-#include "am_util.h"
-
-//*****************************************************************************
-//
-// The entry point for the application.
-//
-//*****************************************************************************
-extern int main(int argc, char**argv);
-
-void DebugLog(const char* s) { am_util_stdio_printf( "%s", s); }
-void DebugLogInt32(int32_t i) { am_util_stdio_printf( "%d", i); }
-void DebugLogUInt32(uint32_t i) { am_util_stdio_printf( "%d", i); }
-void DebugLogHex(uint32_t i) { am_util_stdio_printf( "0x%8x", i); }
-void DebugLogFloat(float i) { am_util_stdio_printf( "%f", i); }
-
-int _main(void)
-{
-    am_util_id_t sIdDevice;
-    uint32_t ui32StrBuf;
-
-    //
-    // Set the clock frequency.
-    //
-    am_hal_clkgen_control(AM_HAL_CLKGEN_CONTROL_SYSCLK_MAX, 0);
-
-    //
-    // Set the default cache configuration
-    //
-    am_hal_cachectrl_config(&am_hal_cachectrl_defaults);
-    am_hal_cachectrl_enable();
-
-    //
-    // Configure the board for low power operation.
-    //
-    am_bsp_low_power_init();
-
-    //
-    // Initialize the printf interface for UART output
-    //
-    am_bsp_uart_printf_enable();
-
-    //
-    // Print the banner.
-    //
-    am_util_stdio_terminal_clear();
-    am_util_stdio_printf("Hello World!\n\n");
-
-    //
-    // Print the device info.
-    //
-    am_util_id_device(&sIdDevice);
-    am_util_stdio_printf("Vendor Name: %s\n", sIdDevice.pui8VendorName);
-    am_util_stdio_printf("Device type: %s\n", sIdDevice.pui8DeviceName);
-
-
-    am_util_stdio_printf("Qualified: %s\n",
-                         sIdDevice.sMcuCtrlDevice.ui32Qualified ?
-                         "Yes" : "No");
-
-    am_util_stdio_printf("Device Info:\n"
-                         "\tPart number: 0x%08X\n"
-                         "\tChip ID0:    0x%08X\n"
-                         "\tChip ID1:    0x%08X\n"
-                         "\tRevision:    0x%08X (Rev%c%c)\n",
-                         sIdDevice.sMcuCtrlDevice.ui32ChipPN,
-                         sIdDevice.sMcuCtrlDevice.ui32ChipID0,
-                         sIdDevice.sMcuCtrlDevice.ui32ChipID1,
-                         sIdDevice.sMcuCtrlDevice.ui32ChipRev,
-                         sIdDevice.ui8ChipRevMaj, sIdDevice.ui8ChipRevMin );
-
-    //
-    // If not a multiple of 1024 bytes, append a plus sign to the KB.
-    //
-    ui32StrBuf = ( sIdDevice.sMcuCtrlDevice.ui32FlashSize % 1024 ) ? '+' : 0;
-    am_util_stdio_printf("\tFlash size:  %7d (%d KB%s)\n",
-                         sIdDevice.sMcuCtrlDevice.ui32FlashSize,
-                         sIdDevice.sMcuCtrlDevice.ui32FlashSize / 1024,
-                         &ui32StrBuf);
-
-    ui32StrBuf = ( sIdDevice.sMcuCtrlDevice.ui32SRAMSize % 1024 ) ? '+' : 0;
-    am_util_stdio_printf("\tSRAM size:   %7d (%d KB%s)\n\n",
-                         sIdDevice.sMcuCtrlDevice.ui32SRAMSize,
-                         sIdDevice.sMcuCtrlDevice.ui32SRAMSize / 1024,
-                         &ui32StrBuf);
-
-    //
-    // Print the compiler version.
-    //
-    am_util_stdio_printf("App Compiler:    %s\n", COMPILER_VERSION);
-#ifdef AM_PART_APOLLO3
-    am_util_stdio_printf("HAL Compiler:    %s\n", g_ui8HALcompiler);
-    am_util_stdio_printf("HAL SDK version: %d.%d.%d\n",
-                         g_ui32HALversion.s.Major,
-                         g_ui32HALversion.s.Minor,
-                         g_ui32HALversion.s.Revision);
-    am_util_stdio_printf("HAL compiled with %s-style registers\n",
-                         g_ui32HALversion.s.bAMREGS ? "AM_REG" : "CMSIS");
-
-    am_util_stdio_printf("&sIdDevice: 0x%x, &ui32StrBuf: 0x%x\n", &sIdDevice, &ui32StrBuf);
-    am_hal_security_info_t secInfo;
-    char sINFO[32];
-    uint32_t ui32Status;
-#endif // AM_PART_APOLLO3
-    main(0, NULL);
-}
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/prep_apollo3_files.sh b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/prep_apollo3_files.sh
new file mode 100755
index 0000000000000000000000000000000000000000..7ef23095022b24922b28580ce3e8d1c76b81086f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/prep_apollo3_files.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+AP3_DIR="tensorflow/lite/experimental/micro/tools/make/downloads/Apollo3-SDK-2018.08.13"
+if [ ! -d $AP3_DIR ]; then
+    echo "Apollo 3 SDK does not exist"
+    echo "Either the SDK has not been downloaded, or this script is not being run from the root of the repository"
+else
+    DEST_DIR="tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb"
+    cp "$AP3_DIR/boards/apollo3_evb/examples/hello_world/gcc/startup_gcc.c" "$DEST_DIR"
+    cp "$AP3_DIR/boards/apollo3_evb/examples/hello_world/gcc/hello_world.ld" "$DEST_DIR/apollo3evb.ld"
+    sed -i -e '131s/1024/1024\*20/g' "$DEST_DIR/startup_gcc.c"
+    sed -i -e 's/main/_main/g' "$DEST_DIR/startup_gcc.c"
+    sed -i -e '3s/hello_world.ld/apollo3evb.ld/g' "$DEST_DIR/apollo3evb.ld"
+    sed -i -e '3s/startup_gnu/startup_gcc/g' "$DEST_DIR/apollo3evb.ld"
+    sed -i -e '6s/am_reset_isr/Reset_Handler/g' "$DEST_DIR/apollo3evb.ld"
+    sed -i -e '22s/\*(.text\*)/\*(.text\*)\n\n\t\/\* These are the C++ global constructors.  Stick them all here and\n\t \* then walk through the array in main() calling them all.\n\t \*\/\n\t_init_array_start = .;\n\tKEEP (\*(SORT(.init_array\*)))\n\t_init_array_end = .;\n\n\t\/\* XXX Currently not doing anything for global destructors. \*\/\n/g' "$DEST_DIR/apollo3evb.ld"
+    sed -i -e "70s/} > SRAM/} > SRAM\n    \/\* Add this to satisfy reference to symbol 'end' from libnosys.a(sbrk.o)\n     \* to denote the HEAP start.\n     \*\/\n   end = .;/g" "$DEST_DIR/apollo3evb.ld"
+    echo "Finished preparing Apollo3 files"
+    
+
+fi
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc
index f722204feaded521945cd269b36576e560dac3e4..6ed402a623188a7c39a007a1cfd7dbc67b775103 100644
--- a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc
@@ -1,13 +1,14 @@
 # Settings for apollo3 evb platforms.
 ifeq ($(TARGET), apollo3evb)
+  export PATH := $(MAKEFILE_DIR)/downloads/gcc_embedded/bin/:$(PATH)
   TARGET_ARCH := cortex-m4
   TARGET_TOOLCHAIN_PREFIX := arm-none-eabi-
   # Download the Ambiq Apollo3 SDK and set this variable to find the header
   # files:
-  APOLLO3_SDK := /ssd/ambiq/AmbiqSuite\ SDK\ for\ Apollo3/Apollo3-SDK-2018.08.13/
+  APOLLO3_SDK := $(MAKEFILE_DIR)/downloads/Apollo3-SDK-2018.08.13
   # Need a pointer to the GNU ARM toolchain for crtbegin.o for the fp functions
   # with the softfp interfaces.
-  GCC_ARM := /ssd/gnu_arm_toolchain/gcc-arm-none-eabi-7-2018-q2-update/
+  GCC_ARM := $(MAKEFILE_DIR)/downloads/gcc_embedded/
 
   PLATFORM_FLAGS = \
     -DPART_apollo3 \
@@ -16,6 +17,8 @@ ifeq ($(TARGET), apollo3evb)
     -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
     -DTF_LITE_STATIC_MEMORY \
     -DTF_LITE_MCU_DEBUG_LOG \
+    -D __FPU_PRESENT=1 \
+    -DARM_MATH_CM4 \
     -fno-rtti \
     -fmessage-length=0 \
     -fno-exceptions \
@@ -41,8 +44,8 @@ ifeq ($(TARGET), apollo3evb)
     -fomit-frame-pointer \
     -fpermissive \
     -nostdlib \
-    -g \
-    -Os
+    -ggdb \
+    -O3
   CXXFLAGS += $(PLATFORM_FLAGS)
   CCFLAGS += $(PLATFORM_FLAGS)
   LDFLAGS += \
@@ -52,17 +55,18 @@ ifeq ($(TARGET), apollo3evb)
     -Wl,--start-group -lm -lc -lgcc -Wl,--end-group \
     -fno-exceptions \
     -nostdlib --specs=nano.specs -t -lstdc++ -lc -lnosys -lm \
-    -Wl,-T,$(MAKEFILE_DIR)/targets/apollo3evb/apollo3evb.ld \
+    -Wl,-T,$(APOLLO3_SDK)/boards/apollo3_evb/examples/hello_world/gcc_patched/apollo3evb.ld \
     -Wl,-Map=$(MAKEFILE_DIR)/gen/$(TARGET).map,--cref
   BUILD_TYPE := micro
-  # The apollo3evb libs should be copied from the SDK after building them.
   MICROLITE_LIBS := \
-    $(MAKEFILE_DIR)/targets/apollo3evb/libam_bsp.a \
-    $(MAKEFILE_DIR)/targets/apollo3evb/libam_hal.a \
+    $(APOLLO3_SDK)/boards/apollo3_evb/bsp/gcc/bin/libam_bsp.a \
+    $(APOLLO3_SDK)/mcu/apollo3/hal/gcc/bin/libam_hal.a \
     $(GCC_ARM)/lib/gcc/arm-none-eabi/7.3.1/thumb/v7e-m/fpv4-sp/softfp/crtbegin.o \
     -lm
   INCLUDES += \
     -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \
+    -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Include/ \
+    -I$(MAKEFILE_DIR)/downloads/CMSIS_ext/ \
     -I$(GCC_ARM)/arm-none-eabi/ \
     -I$(APOLLO3_SDK)/mcu/apollo3/ \
     -I$(APOLLO3_SDK)/CMSIS/AmbiqMicro/Include/ \
@@ -79,26 +83,37 @@ ifeq ($(TARGET), apollo3evb)
   # setting clock speed, default uart setups, etc. and an implementation
   # of the DebugLog interfaces.
   MICROLITE_CC_SRCS += \
-    $(MAKEFILE_DIR)/targets/apollo3evb/startup_gcc.c \
-    $(MAKEFILE_DIR)/targets/apollo3evb/_main.c \
-    $(MAKEFILE_DIR)/targets/apollo3evb/am_util_delay.c \
-    $(MAKEFILE_DIR)/targets/apollo3evb/am_util_faultisr.c \
-    $(MAKEFILE_DIR)/targets/apollo3evb/am_util_id.c \
-    $(MAKEFILE_DIR)/targets/apollo3evb/am_util_stdio.c
+    $(APOLLO3_SDK)/boards/apollo3_evb/examples/hello_world/gcc_patched/startup_gcc.c \
+    $(APOLLO3_SDK)/utils/am_util_delay.c \
+    $(APOLLO3_SDK)/utils/am_util_faultisr.c \
+    $(APOLLO3_SDK)/utils/am_util_id.c \
+    $(APOLLO3_SDK)/utils/am_util_stdio.c
+
+  CMSIS_SRC_DIR := tensorflow/lite/experimental/micro/tools/make/downloads/cmsis/CMSIS/DSP/Source
+  CMSIS_SRCS := \
+  $(CMSIS_SRC_DIR)/BasicMathFunctions/arm_mult_q15.c \
+  $(CMSIS_SRC_DIR)/TransformFunctions/arm_rfft_init_q15.c \
+  $(CMSIS_SRC_DIR)/TransformFunctions/arm_rfft_q15.c \
+  $(CMSIS_SRC_DIR)/TransformFunctions/arm_cfft_q15.c \
+  $(CMSIS_SRC_DIR)/TransformFunctions/arm_cfft_radix4_q15.c \
+  $(CMSIS_SRC_DIR)/CommonTables/arm_const_structs.c \
+  $(CMSIS_SRC_DIR)/CommonTables/arm_common_tables.c \
+  $(CMSIS_SRC_DIR)/StatisticsFunctions/arm_mean_q15.c \
+  $(CMSIS_SRC_DIR)/StatisticsFunctions/arm_max_q7.c
+
+  AP3_EXT_MICRO_DIR := $(MAKEFILE_DIR)/downloads/apollo3_ext
+  AP3_MICRO_DIR := tensorflow/lite/experimental/micro/examples/micro_speech/apollo3
+  CMSIS_DIR := tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS
+  CMSIS_EXT_DIR := $(MAKEFILE_DIR)/downloads/CMSIS_ext
+
+  MICRO_SPEECH_TEST_SRCS += \
+    $(AP3_MICRO_DIR)/_main.c
 
   TEST_SCRIPT := tensorflow/lite/experimental/log_test/test_apollo3evb_binary.sh
-  # These are tests that don't currently work on the blue pill.
+  # These are tests that don't currently work on the Apollo3 board.
   EXCLUDED_TESTS := \
     tensorflow/lite/experimental/micro/micro_interpreter_test.cc \
     tensorflow/lite/experimental/micro/simple_tensor_allocator_test.cc
   MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS))
 
-# These are microcontroller-specific rules for converting the ELF output
-# of the linker into a binary image that can be loaded directly.
-OBJCOPY := $(TARGET_TOOLCHAIN_PREFIX)objcopy
-
-$(BINDIR)/%.bin: $(BINDIR)/%
-	@mkdir -p $(dir $@)
-	$(OBJCOPY) $< $@ -O binary
-
 endif
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc
index 5e3105a109b99b061a35b9c6f6c7c5f3681e2b45..b344f844bca7e7045eafaba141dc5e6371c3f496 100644
--- a/tensorflow/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc
@@ -47,7 +47,10 @@ ifeq ($(TARGET), bluepill)
   MICROLITE_CC_SRCS += \
     $(wildcard $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/*.c) \
     $(wildcard $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/*.cc)
-    TEST_SCRIPT := tensorflow/lite/experimental/micro/testing/test_bluepill_binary.sh
+  EXCLUDED_SRCS := \
+    $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/debug_log.c
+  MICROLITE_CC_SRCS := $(filter-out $(EXCLUDED_SRCS), $(MICROLITE_CC_SRCS))
+  TEST_SCRIPT := tensorflow/lite/experimental/micro/testing/test_bluepill_binary.sh
   # These are tests that don't currently work on the blue pill.
   EXCLUDED_TESTS := \
     tensorflow/lite/experimental/micro/micro_interpreter_test.cc \
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.txt b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3cb74a72437be8017527c0ea05a1b82eb1a4ac9e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.txt
@@ -0,0 +1,32 @@
+Running The Micro Speech Example On Eta Compute's ECM3531EVB
+
+This code will enable you to compile and execute the Tensorflow Lite Micro Speech Example on Eta Computes's low power ECM3531 chip.
+
+
+GETTING STARTED:
+
+1. Download the Tensorflow code from Github and follow instructions there to download other dependencies.  
+
+2. Download the Eta Compute SDK, version 0.0.17.
+
+3. Install the Arm compiler arm-none-eabi-gcc, version = arm-none-eabi-gcc (GNU Tools for Arm Embedded Processors 7-2018-q2-update) 7.3.1 20180622 (release) [ARM/embedded-7-branch revision 261907]
+
+4. Edit the file   tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc  so that the variable ETA_SDK points to the location where the Eta Compute SDK is installed, and the variable GCC_ARM points to the Arm compiler.
+
+5. Compile the code with the command   "make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=ecm3531 test".  This will create the executable tensorflow/lite/experimental/micro/tools/make/gen/ecm3531_cortex-m3/bin/micro_speech_test.
+
+6. Connect the board to the host computer, start PuTTY (Connection type = Serial, Speed = 11520, Data bits = 8, Stop bits = 1,  Parity = None), and load the executable with ocd.  A sample script for loading the image is provided in tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program.  
+
+The following  will be printed on the Uart:
+
+Testing TestInvoke
+Ran successfully
+
+/ tests passed
+~~~ALL TESTS PASSED~~~
+
+
+
+CONTACT INFORMATION:
+
+Contact info@etacompute.com  for more information on obtaining the Eta Compute SDK and evalution board.
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/_main.c b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/_main.c
new file mode 100644
index 0000000000000000000000000000000000000000..2764f3ba50de699fa72717585114369cf833d76e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/_main.c
@@ -0,0 +1,95 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* This is file contains the entry point to the application and is called after
+   startup.
+   The GPIOs, Uart and timer are intialized and Tensorflow is invoked with the
+   call to main().
+   Tensorflow will print out if the tests have passed or failed and the
+   execution time is also
+   printed. */
+
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include "eta_bsp.h"
+#include "eta_chip.h"
+#include "eta_csp.h"
+#include "eta_csp_buck.h"
+#include "eta_csp_gpio.h"
+#include "eta_csp_io.h"
+#include "eta_csp_pwr.h"
+#include "eta_csp_rtc.h"
+#include "eta_csp_socctrl.h"
+#include "eta_csp_sys_clock.h"
+#include "eta_csp_timer.h"
+#include "eta_csp_uart.h"
+
+tUart g_sUart0 = {eUartNum0, eUartBaud115200};
+tUart g_sUart1 = {eUartNum1, eUartBaud115200};
+
+int init_main(int);
+void EtaPrintExecutionTime(uint64_t);
+
+//*****************************************************************************
+//
+// The entry point for the application.
+//
+//*****************************************************************************
+extern int main(int argc, char** argv);
+
+void DebugLog(const char* s) { EtaCspIoPrintf("%s", s); }
+void DebugLogInt32(int32_t i) { EtaCspIoPrintf("%d", i); }
+void DebugLogUInt32(uint32_t i) { EtaCspIoPrintf("%d", i); }
+void DebugLogHex(uint32_t i) { EtaCspIoPrintf("0x%8x", i); }
+void DebugLogFloat(float i) { EtaCspIoPrintf("%f", i); }
+
+int _main(void) {
+  uint64_t time_ms;
+
+  EtaCspInit();      // initialize csp registers
+  EtaCspGpioInit();  // initialize gpios
+  EtaCspUartInit(&g_sUart1, eUartNum0, eUartBaud115200,
+                 eUartFlowControlHardware);  // initialize Uart
+  EtaCspBuckInit(ETA_BSP_VDD_IO_SETTING, eBuckAo600Mv, eBuckM3Frequency60Mhz,
+                 eBuckMemVoltage900Mv);  // set M3 freq
+  EtaCspTimerInitMs();                   // start timer
+  main(0, NULL);  // Call to Tensorflow; this will print if test was successful.
+  time_ms = EtaCspTimerCountGetMs();  // read time
+  EtaPrintExecutionTime(time_ms);     // print execution time
+}
+
+void EtaPrintExecutionTime(uint64_t time_ms) {
+  uint8_t c;
+  int k1;
+  char time_string[] = "00000";
+
+  EtaCspIoPrintf("Execution time (msec) = ");
+  if (time_ms < 100000)  // Convert time to a string
+  {
+    for (k1 = 0; k1 < 5; k1++) {
+      c = time_ms % 10;
+      time_ms = time_ms / 10;
+      time_string[k1] = (char)(0x30 + c);
+    }
+    for (k1 = 4; k1 >= 0; k1--) {  // print out 1 char at a time
+      EtaCspUartPutc(&g_sUart1, time_string[k1]);
+    }
+  } else {
+    EtaCspIoPrintf("Execution time exceeds 100 sec\n");
+  }
+  EtaCspIoPrintf("\n\n");
+}
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531.lds b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531.lds
new file mode 100644
index 0000000000000000000000000000000000000000..af34f988f2d04a0c1c87f20d6058df560db7e2c5
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531.lds
@@ -0,0 +1,85 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+
+/*
+ * linker script for use with ECM3531
+ * All sections must map to 128KBytes of SRAM beginning at 0x10000000
+ *
+ */
+
+ /*
+  * Indicate to the linker the entry point.
+  */
+ENTRY(ResetISR)
+
+/*
+ *   SRAM is at 0x10000000 of length 0x00020000
+ */
+MEMORY
+{
+    SRAM (RWX) : ORIGIN = 0x10000000, LENGTH = 0x00020000
+}
+
+SECTIONS
+{
+    .text :
+    {
+        _text = .;
+        KEEP(*(.vectors))
+        . = ALIGN(0x4);
+        *(.text*)
+        . = ALIGN(0x4);
+        *(.rodata*)
+        . = ALIGN(0x4);
+        _etext = .;
+    } > SRAM= 0
+    .dummy :
+    {
+        . = ALIGN(0x4);
+        _eftext = .;
+    } > SRAM
+    .datax :
+    {
+        _datax = .;
+        KEEP(*(.mainStack))
+        . += 12288;
+        _edatax = .;
+        _stack_top = .;
+        . += 4;
+    } > SRAM
+    .data :
+       AT (ADDR(.text) + SIZEOF(.text) ) 
+    {
+        _data = .;
+        *(.data*)
+        KEEP(*(.mainHeap))
+        _edata = .;
+    } > SRAM
+
+    .bss :
+    {
+        _bss = .;
+        *(.bss*)
+        *(COMMON)
+        _ebss = .;
+    } > SRAM
+    .ARM.exidx :
+    {
+       *(.ARM.exidx*)
+    }
+
+}
+
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program
new file mode 100755
index 0000000000000000000000000000000000000000..ac1f49962a61756ccbde02300c612bd7b4f48e84
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program
@@ -0,0 +1,53 @@
+#!/usr/bin/python3
+#Usage: cd to the directory  tensorflow/lite/experimental/micro/tools/make/targets/ecm3531 and type ./load_prgram to load the executable tensorflow/lite/experimental/micro/tools/make/gen/ecm3531_cortex-m3/bin/micro_speech_test into SRAM
+#
+#
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+#==============================================================================
+
+
+import os
+import telnetlib
+
+def send_ocd_cmd(line):
+    ocd_sock.write(bytes(line,encoding = 'utf-8'))
+    print(ocd_sock.read_until(b'> ').decode('utf-8'), end='')
+
+def get_ocd_response():
+    print(ocd_sock.read_until(b'> ').decode('utf-8'), end='')
+
+#get hooked up to openocd daemon
+ocd_sock = telnetlib.Telnet(host='localhost', port=4444)
+get_ocd_response() # clean it out
+
+# git path to project elf file
+cur_dir = os.getcwd()
+elf_file = cur_dir + '/../../gen/ecm3531_cortex-m3/bin/' + 'micro_speech_test'
+print("elf_file = ",elf_file)
+
+
+# use these to download and run the elf fle
+ocd_commands = ["halt\n",
+                "load_image {}\n".format(elf_file),
+                "mww 0x1001FFF8 0xDEADBEEF\n",
+                "mww 0x1001FFFC 0xC369A517\n",
+                "reset\n"]
+
+# OK now do what we came here for!!!
+for x in ocd_commands: 
+    print(x)
+    send_ocd_cmd(x)
+
+
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/startup.c b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/startup.c
new file mode 100644
index 0000000000000000000000000000000000000000..32d817ba4882f9123a9ed6321f9339355d82db5c
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/startup.c
@@ -0,0 +1,432 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* This file is called at power up time to initialize the chip.  It in turn
+calls _main() which is the entry point into the application */
+
+#include <stdint.h>
+#include "eta_chip.h"
+#include "memio.h"
+
+#ifndef NULL
+#define NULL (0)
+#endif
+
+//*****************************************************************************
+//
+// Macro for hardware access, both direct and via the bit-band region.
+//
+//*****************************************************************************
+
+int _main(int argc, char *argv[]);
+void set_vtor(void);
+void *startup_get_my_pc(void);
+
+//*****************************************************************************
+// Forward DECLS for interrupt service routines (ISR)
+//*****************************************************************************
+extern void ResetISR(void) __attribute__((weak, alias("default_ResetISR")));
+extern void NmiSR(void) __attribute__((weak, alias("default_NmiSR")));
+extern void FaultISR(void) __attribute__((weak, alias("default_FaultISR")));
+
+extern void DebugMonitor_ISR(void)
+    __attribute__((weak, alias("default_DebugMonitor_ISR")));
+extern void SVCall_ISR(void) __attribute__((weak, alias("default_SVCall_ISR")));
+extern void PENDSV_ISR(void) __attribute__((weak, alias("default_PENDSV_ISR")));
+
+extern void SYSTICK_ISR(void)
+    __attribute__((weak, alias("default_SYSTICK_ISR")));
+
+extern void GPIO0_ISR(void) __attribute__((weak, alias("default_GPIO0_ISR")));
+extern void GPIO1_ISR(void) __attribute__((weak, alias("default_GPIO1_ISR")));
+extern void TIMER0_ISR(void) __attribute__((weak, alias("default_TIMER0_ISR")));
+extern void TIMER1_ISR(void) __attribute__((weak, alias("default_TIMER1_ISR")));
+extern void UART0_ISR(void) __attribute__((weak, alias("default_UART0_ISR")));
+extern void UART1_ISR(void) __attribute__((weak, alias("default_UART1_ISR")));
+extern void SPI0_ISR(void) __attribute__((weak, alias("default_SPI0_ISR")));
+extern void SPI1_ISR(void) __attribute__((weak, alias("default_SPI1_ISR")));
+extern void I2C0_ISR(void) __attribute__((weak, alias("default_I2C0_ISR")));
+extern void I2C1_ISR(void) __attribute__((weak, alias("default_I2C1_ISR")));
+extern void RTC0_ISR(void) __attribute__((weak, alias("default_RTC0_ISR")));
+extern void RTC1_ISR(void) __attribute__((weak, alias("default_RTC1_ISR")));
+extern void DSP_ISR(void) __attribute__((weak, alias("default_DSP_ISR")));
+extern void ADC_ISR(void) __attribute__((weak, alias("default_ADC_ISR")));
+extern void SW0_ISR(void) __attribute__((weak, alias("default_SW0_ISR")));
+extern void SW1_ISR(void) __attribute__((weak, alias("default_SW1_ISR")));
+extern void PWM_ISR(void) __attribute__((weak, alias("default_PWM_ISR")));
+extern void WDT_ISR(void) __attribute__((weak, alias("default_WDT_ISR")));
+extern void RTC_TMR_ISR(void)
+    __attribute__((weak, alias("default_RTC_TMR_ISR")));
+
+extern void SW2_ISR(void) __attribute__((weak, alias("default_SW1_ISR")));
+extern void SW3_ISR(void) __attribute__((weak, alias("default_SW1_ISR")));
+extern void SW4_ISR(void) __attribute__((weak, alias("default_SW1_ISR")));
+extern void SW5_ISR(void) __attribute__((weak, alias("default_SW1_ISR")));
+extern void SW6_ISR(void) __attribute__((weak, alias("default_SW1_ISR")));
+
+extern void IntDefaultHandler(void) __attribute__((weak));
+
+//*****************************************************************************
+//
+// Reserve space for the system stack.
+//
+//*****************************************************************************
+extern uint32_t _stack_top;
+//__attribute__ ((section(".mainStack"), used))
+// static uint32_t pui32Stack[2048];
+#define STARTUP_STACK_TOP (&_stack_top)
+
+//*****************************************************************************
+// VECTOR TABLE
+//*****************************************************************************
+__attribute__((section(".vectors"), used)) void (*const gVectors[])(void) = {
+    //(void (*)(void))((uint32_t)pui32Stack + sizeof(pui32Stack)), // Stack
+    // pointer
+    (void *)STARTUP_STACK_TOP,
+    ResetISR,           // Reset handler
+    NmiSR,              // The NMI handler
+    FaultISR,           // The hard fault handler
+    IntDefaultHandler,  // 4 The MPU fault handler
+    IntDefaultHandler,  // 5 The bus fault handler
+    IntDefaultHandler,  // 6 The usage fault handler
+    0,                  // 7 Reserved
+    0,                  // 8 Reserved
+    0,                  // 9 Reserved
+    0,                  // 10 Reserved
+    SVCall_ISR,         // 11 SVCall handler
+    DebugMonitor_ISR,   // 12 Debug monitor handler
+    0,                  // 13 Reserved
+    PENDSV_ISR,         // 14 The PendSV handler
+    SYSTICK_ISR,        // 15 The SysTick handler
+
+    // external interrupt service routines (ISR)
+    GPIO0_ISR,    // 16 GPIO Port A            [ 0]
+    GPIO1_ISR,    // 17 GPIO Port B            [ 1]
+    TIMER0_ISR,   // 18 Timer 0                [ 2]
+    TIMER1_ISR,   // 19 Timer 1                [ 3]
+    UART0_ISR,    // 20 UART 0                 [ 4]
+    UART1_ISR,    // 21 UART 1                 [ 5]
+    SPI0_ISR,     // 22 SPI0                   [ 6]
+    SPI1_ISR,     // 23 SPI1                   [ 7]
+    I2C0_ISR,     // 24 I2C 0                  [ 8]
+    I2C1_ISR,     // 25 I2C 1                  [ 9]
+    RTC0_ISR,     // 26 RTC 0                  [10]
+    RTC1_ISR,     // 27 RTC 1                  [11]
+    DSP_ISR,      // 28 DSP MAILBOX            [12]
+    ADC_ISR,      // 29 ADC                    [13]
+    PWM_ISR,      // 32 PWM                    [14]
+    WDT_ISR,      // 33 WDT                    [15]
+    RTC_TMR_ISR,  // 34 RTC                    [16]
+
+    SW0_ISR,  // 30 Software Interrupt 0   [17]
+    SW1_ISR,  // 31 Software Interrupt 1   [18]
+    SW2_ISR,  // 35 Software Interrupt 2   [19]
+    SW3_ISR,  // 36 Software Interrupt 3   [20]
+    SW4_ISR,  // 37 Software Interrupt 4   [21]
+    SW5_ISR,  // 38 Software Interrupt 5   [22]
+    SW6_ISR,  // 39 Software Interrupt 6   [23]
+
+};
+
+//*****************************************************************************
+//
+// The following are constructs created by the linker, indicating where the
+// the "data" and "bss" segments reside in memory.  The initializers for the
+// for the "data" segment resides immediately following the "text" segment.
+//
+//*****************************************************************************
+extern uint32_t _etext;
+extern uint32_t _eftext;
+extern uint32_t _data;
+extern uint32_t _edata;
+extern uint32_t _bss;
+extern uint32_t _ebss;
+
+//
+// And here are the weak interrupt handlers.
+//
+void default_NmiSR(void) {
+  __asm("    movs     r0, #2");
+  while (1) {
+  }
+}
+
+void default_FaultISR(void) {
+  __asm("    movs     r0, #3");
+  MEMIO32(0x1001FFF0) = 0xbad0beef;  // near the top of 128KB of SRAM
+  MEMIO32(0x1001FFF4) = 0xbad1beef;  // near the top of 128KB of SRAM
+  while (1) {
+    __asm("    BKPT      #1");
+  }
+}
+
+void IntDefaultHandler(void) {
+  __asm("    movs     r0, #20");
+  while (1) {
+    __asm("    BKPT      #1");
+  }
+}
+
+void default_SVCall_ISR(void) {
+  __asm("    movs     r0, #11");
+  while (1) {
+    __asm("    BKPT      #11");
+  }
+}
+
+void default_DebugMonitor_ISR(void) {
+  __asm("    movs     r0, #12");
+  while (1) {
+    __asm("    BKPT      #12");
+  }
+}
+
+void default_PENDSV_ISR(void) {
+  __asm("    movs     r0, #14");
+  while (1) {
+    __asm("    BKPT      #14");
+  }
+}
+
+void default_SYSTICK_ISR(void) {
+  __asm("    movs     r0, #15");
+  while (1) {
+    __asm("    BKPT      #15");
+  }
+}
+
+//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+void default_SPI0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_SPI1_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_I2C0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_I2C1_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_UART0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_UART1_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_GPIO0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_GPIO1_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_ADC_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_DSP_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_TIMER0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_TIMER1_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_RTC0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_RTC1_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_PWM_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_WDT_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_RTC_TMR_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_SW0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_SW1_ISR(void) {
+  __asm("    movs     r0, #17");
+  while (1) {
+    __asm("    BKPT      #17");
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Reset ISR
+////////////////////////////////////////////////////////////////////////////////
+void default_ResetISR(void) {
+  int rc;
+  bool bRunningInFlash;
+
+  set_vtor();
+
+  bRunningInFlash =
+      ((((uint32_t)startup_get_my_pc()) & 0xFF000000) == 0x01000000);
+
+  if ((!REG_RTC_AO_CSR.BF.WARM_START_MODE) || bRunningInFlash) {
+    //
+    //  Copy any .ro bytes to .data so that initialized global variables
+    //  are actually properly initialized.
+    //
+    __asm(
+        "    ldr      r0, =_eftext\n"
+        "    ldr      r1, =_data\n"
+        "    ldr      r2, =_edata\n"
+        "ro_copy_loop:\n"
+        "    ldr      r3, [r0], #4\n"
+        "    str      r3, [r1], #4\n"
+        "    cmp      r1, r2\n"
+        "    ble      ro_copy_loop\n");
+
+    //
+    // Zero fill the .bss section.
+    //
+    __asm(
+        "    ldr      r0, =_bss\n"
+        "    ldr      r1, =_ebss\n"
+        "    mov      r2, #0\n"
+        "bss_zero_loop:\n"
+        "    cmp      r0, r1\n"
+        "    it       lt\n"
+        "    strlt    r2, [r0], #4\n"
+        "    blt      bss_zero_loop\n");
+  }
+
+  //
+  // call the main routine barefoot, i.e. without the normal CRTC0 entry
+  // point.
+  //
+  rc = _main(0, NULL);
+
+  //
+  //  If main ever returns, trap it here and wake up the debugger if it is
+  //  connected.
+  //
+  while (1)  // for FPGA/real chip use
+  {
+    __asm("    BKPT      #1");
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// get my PC
+////////////////////////////////////////////////////////////////////////////////
+void *startup_get_my_pc(void) {
+  void *pc;
+  asm("mov %0, pc" : "=r"(pc));
+  return pc;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// get my SP
+////////////////////////////////////////////////////////////////////////////////
+void *startup_get_my_sp(void) {
+  void *sp;
+  asm("mov %0, sp" : "=r"(sp));
+  return sp;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Set VTOR based on PC
+////////////////////////////////////////////////////////////////////////////////
+void set_vtor(void) {
+  __asm(
+      "    ldr      r0, =0xe000ed08\n"
+      "    ldr      r1, =0xFF000000\n"
+      "    mov      r2, lr\n"
+      "    and      r1, r2\n"
+      "    str      r1, [r0]\n");
+
+  return;
+}
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..baae58f87e1761c978a87256fda8b7e90edb79e5
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc
@@ -0,0 +1,103 @@
+# Settings for eta ecm3531 platform
+ifeq ($(TARGET), ecm3531)
+  TARGET_ARCH := cortex-m3
+  TARGET_TOOLCHAIN_PREFIX := arm-none-eabi-
+  ETA_SDK := /home/hari/TensaiSDK-v0.0.17/soc/
+  GCC_ARM := /home/hari/Downloads/gcc-arm-none-eabi-7-2018-q2-update/
+
+  ifeq ($(wildcard $(ETA_SDK)),)
+    $(error Path to ETA SDK is not set (ETA_SDK))
+  endif
+
+  ifeq ($(wildcard $(GCC_ARM)),)
+    $(error Path to gcc arm compiler is not set (GCC_ARM))
+  endif
+
+  PLATFORM_FLAGS = \
+    -DFIRMWARE_BUILD \
+    -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
+    -DTF_LITE_STATIC_MEMORY \
+    -DTF_LITE_MCU_DEBUG_LOG \
+    -fno-rtti \
+    -fmessage-length=0 \
+    -fno-exceptions \
+    -fno-unwind-tables \
+    -fno-builtin \
+    -ffunction-sections \
+    -fdata-sections \
+    -funsigned-char \
+    -MMD \
+    -mcpu=cortex-m3 \
+    -mthumb \
+    -mlittle-endian \
+    -mno-unaligned-access \
+    -std=gnu++11 \
+    -Wvla \
+    -Wall \
+    -Wextra \
+    -Wno-unused-parameter \
+    -Wno-missing-field-initializers \
+    -Wno-write-strings \
+    -Wno-sign-compare \
+    -fno-delete-null-pointer-checks \
+    -fomit-frame-pointer \
+    -fpermissive \
+    -nostdlib \
+    -g \
+    -Os
+  CXXFLAGS += $(PLATFORM_FLAGS)
+  CCFLAGS += $(PLATFORM_FLAGS)
+# Adding the --specs=nano.specs flag causes the linker to use libc_nano.a
+# instead of libc.a.  This gets rid of lots of errors with various pieces
+# of the exception unwinding code not being found.  Not clear why it is
+# trying to link in this code to begin with, though.
+  LDFLAGS += \
+    -mthumb -mcpu=cortex-m3 \
+    -nostartfiles -static \
+    -Wl,--gc-sections -Wl,--entry,ResetISR \
+    -Wl,--start-group -lm -lc -lgcc -Wl,--end-group \
+    -fno-exceptions \
+    -nostdlib --specs=nano.specs -t -lstdc++ -lc -lnosys -lm \
+    -Wl,-T,$(MAKEFILE_DIR)/targets/ecm3531/ecm3531.lds \
+    -Wl,-Map=$(MAKEFILE_DIR)/gen/$(TARGET).map,--cref
+  BUILD_TYPE := micro
+  MICROLITE_LIBS := \
+    $(GCC_ARM)/lib/gcc/arm-none-eabi/7.3.1/thumb/v7e-m/fpv4-sp/softfp/crtbegin.o \
+    -lm
+  INCLUDES += \
+    -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \
+    -I$(GCC_ARM)/arm-none-eabi/include/ \
+    -I$(ETA_SDK)/ecm3531/boards/eta_evb/projects/m3/common/inc/ \
+    -I$(ETA_SDK)/ecm3531/m3/reg/inc/ \
+    -I$(ETA_SDK)/ecm3531/m3/csp/inc/ \
+    -I$(ETA_SDK)/ecm3531/common/csp/inc/ \
+    -I$(ETA_SDK)/common/inc/  \
+    -I$(ETA_SDK)/../utils/inc/  \
+    -I$(ETA_SDK)/ecm3531/boards/eta_evb/eta_bsp/inc
+
+  # _main.c contains application and target specific initialization, like
+  # setting clock speed, default uart setups, etc. and an implementation
+  # of the DebugLog interfaces.
+  MICROLITE_CC_SRCS += \
+    $(MAKEFILE_DIR)/targets/ecm3531/startup.c \
+    $(MAKEFILE_DIR)/targets/ecm3531/_main.c \
+    $(wildcard $(ETA_SDK)/ecm3531/boards/eta_evb/projects/m3/common/src/*.c) \
+    $(wildcard $(ETA_SDK)/ecm3531/m3/csp/src/*.c) \
+    $(wildcard $(ETA_SDK)/ecm3531/m3/csp/src/*.s) \
+
+  TEST_SCRIPT := tensorflow/lite/experimental/micro/testing/test_ecm3531_binary.sh
+  # These are tests that don't currently work on the blue pill.
+  EXCLUDED_TESTS := \
+    tensorflow/lite/experimental/micro/micro_interpreter_test.cc \
+    tensorflow/lite/experimental/micro/simple_tensor_allocator_test.cc
+  MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS))
+
+# These are microcontroller-specific rules for converting the ELF output
+# of the linker into a binary image that can be loaded directly.
+OBJCOPY := $(TARGET_TOOLCHAIN_PREFIX)objcopy
+
+$(BINDIR)/%.bin: $(BINDIR)/%
+	@mkdir -p $(dir $@)
+	$(OBJCOPY) $< $@ -O binary
+
+endif
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/mbed_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/mbed_makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..161ff34cdbda07768d33b9af45ed9655665b9bfd
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/mbed_makefile.inc
@@ -0,0 +1,4 @@
+# Settings for mbed platforms.
+ifeq ($(TARGET), mbed)
+  TARGET_ARCH := cortex-m4
+endif
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..3b91eeff9fd5f2df06caa9a5f73b221815f9bbdf
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc
@@ -0,0 +1,10 @@
+# Settings for Mac OS platforms.
+ifeq ($(TARGET), osx)
+
+  PLATFORM_FLAGS = \
+    -DTF_LITE_DISABLE_X86_NEON
+
+  CXXFLAGS += $(PLATFORM_FLAGS)
+  CCFLAGS += $(PLATFORM_FLAGS)
+
+endif
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/AUDIO_DISCO_F746NG.lib.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/AUDIO_DISCO_F746NG.lib.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..11dae1ea16c4ac990af07aebd8b5e59ff748fc2d
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/AUDIO_DISCO_F746NG.lib.tpl
@@ -0,0 +1 @@
+https://os.mbed.com/teams/ST/code/AUDIO_DISCO_F746NG/#7046ce26b7ed
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/BSP_DISCO_F746NG.lib.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/BSP_DISCO_F746NG.lib.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..48dc1317072d537b3c61b0481b272855eb5941be
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/BSP_DISCO_F746NG.lib.tpl
@@ -0,0 +1 @@
+https://os.mbed.com/teams/ST/code/BSP_DISCO_F746NG/#df2ea349c37a
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/Makefile.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/Makefile.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..74d54f1ebee12d7773edfd1b073ddf17dd3791d6
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/Makefile.tpl
@@ -0,0 +1,26 @@
+SRCS := \
+%{SRCS}%
+
+OBJS := \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(SRCS)))
+
+INCLUDES := \
+-I. \
+-I./third_party/gemmlowp \
+-I./third_party/flatbuffers/include
+
+CXXFLAGS += %{CXX_FLAGS}%
+
+LDFLAGS += %{LINKER_FLAGS}%
+
+%.o: %.cc
+	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
+
+%.o: %.c
+	$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
+
+%{EXECUTABLE}% : $(OBJS)
+	$(CXX) $(LDFLAGS) $(OBJS) \
+	-o $@
+
+all: %{EXECUTABLE}%
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/README_MAKE.md.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/README_MAKE.md.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..7906a3226ab5b475d3f0f93f39111e8e21d39a40
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/README_MAKE.md.tpl
@@ -0,0 +1,29 @@
+# TensorFlow Lite Micro Make Project
+
+This folder has been autogenerated by TensorFlow, and contains source, header,
+and project files needed to build a single TensorFlow Lite Micro target using
+the make tool.
+
+## Usage
+
+To build this, run:
+
+```
+make
+```
+
+This should attempt to build the target locally on your platform, using the
+standard Makefile variables like CFLAGS, CC, CXX, and so on.
+
+## Project Generation
+
+See
+[tensorflow/lite/experimental/micro](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro)
+for details on how projects like this can be generated from the main source
+tree.
+
+## License
+
+TensorFlow's code is covered by the Apache2 License included in the repository,
+and third party dependencies are covered by their respective licenses, in the
+third_party folder of this package.
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/README_MBED.md.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/README_MBED.md.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..2682236edf5b847a95aa07fa6d0e30c5a9a10c9a
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/README_MBED.md.tpl
@@ -0,0 +1,48 @@
+# TensorFlow Lite Micro Mbed Project
+
+This folder has been autogenerated by TensorFlow, and contains source, header,
+and project files needed to build a single TensorFlow Lite Micro target using
+the Mbed command line interface.
+
+## Usage
+
+To load the dependencies this code requires, run:
+
+```
+mbed config root .
+mbed deploy
+```
+
+TensorFlow requires C++ 11, so you'll need to update your profiles to reflect
+this. Here's a short Python command that does that:
+
+```
+python -c 'import fileinput, glob;
+for filename in glob.glob("mbed-os/tools/profiles/*.json"):
+  for line in fileinput.input(filename, inplace=True):
+    print line.replace("\"-std=gnu++98\"","\"-std=c++11\", \"-fpermissive\"")'
+```
+
+With that setting updated, you should now be able to compile:
+
+```
+mbed compile -m auto -t GCC_ARM
+```
+
+If this works, it will give you a .bin file that you can flash onto the device
+you're targeting. For example, using a Discovery STM3246G board, you can deploy
+it by copying the bin to the volume mounted as a USB drive, just by dragging
+over the file.
+
+## Project Generation
+
+See
+[tensorflow/lite/experimental/micro](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro)
+for details on how projects like this can be generated from the main source
+tree.
+
+## License
+
+TensorFlow's code is covered by the Apache2 License included in the repository,
+and third party dependencies are covered by their respective licenses, in the
+third_party folder of this package.
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/SDRAM_DISCO_F746NG.lib.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/SDRAM_DISCO_F746NG.lib.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..e2ccd7b81b28df938f19638f953b500c387594dc
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/SDRAM_DISCO_F746NG.lib.tpl
@@ -0,0 +1 @@
+https://os.mbed.com/teams/ST/code/SDRAM_DISCO_F746NG/#370f402a2219
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/mbed-os.lib.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/mbed-os.lib.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..69fff22f335953f62576d3408fbf15e24be5280f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/mbed-os.lib.tpl
@@ -0,0 +1 @@
+https://github.com/ARMmbed/mbed-os/#6a0a86538c0b9b2bfcc4583b1e2b7fea8f4e71e9
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/mbed_app.json.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/mbed_app.json.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..1c547369fb2784b27a9152ba4b7ade77c12211b0
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/mbed_app.json.tpl
@@ -0,0 +1,7 @@
+{
+    "config": {
+	"main-stack-size": {
+            "value": 65536
+	}
+    }
+}
diff --git a/tensorflow/lite/experimental/writer/option_writer_generator.cc b/tensorflow/lite/experimental/writer/option_writer_generator.cc
index fa360a2f47e3dba34e05d2e32616821294f0e678..73742494762b8af9a9a08cd24c6eae1ac25fd426 100644
--- a/tensorflow/lite/experimental/writer/option_writer_generator.cc
+++ b/tensorflow/lite/experimental/writer/option_writer_generator.cc
@@ -69,6 +69,7 @@ static const char* param_structs[] = {"TfLiteConvParams",
                                       "TfLiteOneHotParams",
                                       "TfLiteLeakyReluParams",
                                       "TfLiteMirrorPaddingParams",
+                                      "TfLiteUniqueParams",
                                       nullptr};
 }  // namespace
 
@@ -156,6 +157,7 @@ class OpOptionData {
     op_to_option_["UNIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
     op_to_option_["UNIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
     op_to_option_["MIRROR_PAD"] = "";  // TODO(karimnosseir): MirrorPadOptions.
+    op_to_option_["UNIQUE"] = "";      // TODO(karimnosseir): UniqueOptions.
     // Manually specified mappings between ops and options (none)
     op_to_option_["EMBEDDING_LOOKUP"] =
         "";  // TODO(aselle): maybe something else.
diff --git a/tensorflow/lite/g3doc/apis.md b/tensorflow/lite/g3doc/apis.md
index b15159ce4145727863c335126557e06402f8dbd3..1a05142bc44b824e090fd6eb513360837eac2c69 100644
--- a/tensorflow/lite/g3doc/apis.md
+++ b/tensorflow/lite/g3doc/apis.md
@@ -1,4 +1,3 @@
-
 # TensorFlow Lite APIs
 
 TensorFlow Lite provides programming APIs in C++ and Java, and in both cases
@@ -8,8 +7,7 @@ no surprise that the APIs try to avoid unnecessary copies at the expense of
 convenience.  Similarly, consistency with TensorFlow APIs was not an explicit
 goal and some variance is to be expected.
 
-There is also a Python API for TensorFlow Lite described
-[here](../toco/g3doc/python_api.md#interpreter).
+There is also a [Python API for TensorFlow Lite](g3doc/convert/python_api.md).
 
 ## C++
 
diff --git a/tensorflow/lite/g3doc/demo_ios.md b/tensorflow/lite/g3doc/demo_ios.md
index fbf1dd6392591183d0dc484018bba501de1851d8..f4b481dc6192db703dea4161ed28e2fd63812ebf 100644
--- a/tensorflow/lite/g3doc/demo_ios.md
+++ b/tensorflow/lite/g3doc/demo_ios.md
@@ -7,22 +7,23 @@ instructions walk you through building and running the demo on an iOS device.
 
 ## Prerequisites
 
-* You must have [Xcode](https://developer.apple.com/xcode/) installed and have a
-  valid Apple Developer ID, and have an iOS device set up and linked to your
-  developer account with all of the appropriate certificates. For these
-  instructions, we assume that you have already been able to build and deploy an
-  app to an iOS device with your current developer environment.
+*   You must have [Xcode](https://developer.apple.com/xcode/) installed and have
+    a valid Apple Developer ID, and have an iOS device set up and linked to your
+    developer account with all of the appropriate certificates. For these
+    instructions, we assume that you have already been able to build and deploy
+    an app to an iOS device with your current developer environment.
 
-* The demo app requires a camera and must be executed on a real iOS device. You
-  can build it and run with the iPhone Simulator but it won't have any camera
-  information to classify.
+*   The demo app requires a camera and must be executed on a real iOS device.
+    You can build it and run with the iPhone Simulator but it won't have any
+    camera information to classify.
 
-* You don't need to build the entire TensorFlow library to run the demo, but you
-  will need to clone the TensorFlow repository if you haven't already:
+*   You don't need to build the entire TensorFlow library to run the demo, but
+    you will need to clone the TensorFlow repository if you haven't already:
 
         git clone https://github.com/tensorflow/tensorflow
+        cd tensorflow
 
-* You'll also need the Xcode command-line tools:
+*   You'll also need the Xcode command-line tools:
 
         xcode-select --install
 
@@ -31,35 +32,41 @@ instructions walk you through building and running the demo on an iOS device.
 
 ## Building the iOS Demo App
 
-1. Install CocoaPods if you don't have it:
+1.  Install CocoaPods if you don't have it:
 
         sudo gem install cocoapods
 
-2. Download the model files used by the demo app (this is done from inside the
-   cloned directory):
+2.  Download the model files used by the demo app (this is done from inside the
+    cloned directory):
 
         sh tensorflow/lite/examples/ios/download_models.sh
 
-3. Install the pod to generate the workspace file:
+3.  Install the pod to generate the workspace file:
 
         cd tensorflow/lite/examples/ios/camera
         pod install
 
     If you have installed this pod before and that command doesn't work, try
 
-        pod update
+        pod repo update
 
-    At the end of this step you should have a file called 
+    At the end of this step you should have a file called
     `tflite_camera_example.xcworkspace`.
 
-4. Open the project in Xcode by typing this on the command line:
+4.  Open the project in Xcode by typing this on the command line:
 
         open tflite_camera_example.xcworkspace
 
     This launches Xcode if it isn't open already and opens the
     `tflite_camera_example` project.
 
-5. Build and run the app in Xcode.
+5.  Under `Project navigator -> tflite_camera_example -> Targets ->
+    tflite_camera_example -> General` change the bundle identifier by
+    pre-pending your name:
+
+    ![pre-pend your name to the bundle identifier](images/ios/bundle_identifier.png)
+
+6.  Build and run the app in Xcode.
 
     Note that as mentioned earlier, you must already have a device set up and
     linked to your Apple Developer account in order to deploy the app on a
diff --git a/tensorflow/lite/g3doc/images/convert/sample_after.png b/tensorflow/lite/g3doc/images/convert/sample_after.png
index 6c451f97903f7f70a9f28dee8abf6daeb7ec5693..db09d0a6ca70695205833acfd2bd8ac6682cb065 100644
Binary files a/tensorflow/lite/g3doc/images/convert/sample_after.png and b/tensorflow/lite/g3doc/images/convert/sample_after.png differ
diff --git a/tensorflow/lite/g3doc/images/convert/sample_before.png b/tensorflow/lite/g3doc/images/convert/sample_before.png
index e5317ef295062e79c66430512ef1c45925858ce0..55440d324977f0ff5b795bc80898857918066e96 100644
Binary files a/tensorflow/lite/g3doc/images/convert/sample_before.png and b/tensorflow/lite/g3doc/images/convert/sample_before.png differ
diff --git a/tensorflow/lite/g3doc/images/ios/bundle_identifier.png b/tensorflow/lite/g3doc/images/ios/bundle_identifier.png
new file mode 100644
index 0000000000000000000000000000000000000000..398763916b353e61f236392e2b8898aad2aafe8e
Binary files /dev/null and b/tensorflow/lite/g3doc/images/ios/bundle_identifier.png differ
diff --git a/tensorflow/lite/g3doc/tfmobile/android_build.md b/tensorflow/lite/g3doc/tfmobile/android_build.md
index 2eb776d10cf8ec68987d13b580eddf2f1bda8e78..f8c0243298e435382a7514e72ada89880fb00c1c 100644
--- a/tensorflow/lite/g3doc/tfmobile/android_build.md
+++ b/tensorflow/lite/g3doc/tfmobile/android_build.md
@@ -91,10 +91,10 @@ following lines to your Gradle build file:
         repositories {
             jcenter()
         }
-	}
+    }
 
     dependencies {
-        compile 'org.tensorflow:tensorflow-android:+'
+        implementation 'org.tensorflow:tensorflow-android:+'
     }
 
 This automatically downloads the latest stable version of TensorFlow as an AAR
diff --git a/tensorflow/lite/g3doc/using_select_tf_ops.md b/tensorflow/lite/g3doc/using_select_tf_ops.md
index aa51f58baa4ecf01fbe75d2ce9095bb1a5286ae8..269774a4b10648f92aab5ee6bf5ae3687c263f75 100644
--- a/tensorflow/lite/g3doc/using_select_tf_ops.md
+++ b/tensorflow/lite/g3doc/using_select_tf_ops.md
@@ -130,7 +130,7 @@ allprojects {
 }
 
 dependencies {
-    compile 'org.tensorflow:tensorflow-lite-with-select-tf-ops:0.1.100'
+    implementation 'org.tensorflow:tensorflow-lite-with-select-tf-ops:0.1.100'
 }
 ```
 
diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc
index 2abe062ec67a4b050b51f825769a1669bb836ea3..60fa2130fabaa692d23c109f42fa8883f6e8de19 100644
--- a/tensorflow/lite/interpreter.cc
+++ b/tensorflow/lite/interpreter.cc
@@ -102,15 +102,16 @@ TfLiteStatus Interpreter::ResizeInputTensor(int tensor_index,
 }
 
 TfLiteStatus Interpreter::Invoke() {
-  TfLiteStatus status = primary_subgraph().Invoke();
+  TF_LITE_ENSURE_STATUS(primary_subgraph().Invoke());
 
   if (!allow_buffer_handle_output_) {
     for (int tensor_index : outputs()) {
-      primary_subgraph().EnsureTensorDataIsReadable(tensor_index);
+      TF_LITE_ENSURE_STATUS(
+          primary_subgraph().EnsureTensorDataIsReadable(tensor_index));
     }
   }
 
-  return status;
+  return kTfLiteOk;
 }
 
 TfLiteStatus Interpreter::AddTensors(int tensors_to_add,
diff --git a/tensorflow/lite/java/demo/app/build.gradle b/tensorflow/lite/java/demo/app/build.gradle
index b8fc282cb1dfe8a9c80692759e985bf369fc163d..8ea16a3417ca9733f518776692114501c4162a0e 100644
--- a/tensorflow/lite/java/demo/app/build.gradle
+++ b/tensorflow/lite/java/demo/app/build.gradle
@@ -2,7 +2,7 @@ apply plugin: 'com.android.application'
 
 android {
     compileSdkVersion 26
-    buildToolsVersion "26.0.1"
+    buildToolsVersion "27.0.3"
     defaultConfig {
         applicationId "android.example.com.tflitecamerademo"
         // Required by Camera2 API.
@@ -10,11 +10,6 @@ android {
         targetSdkVersion 26
         versionCode 1
         versionName "1.0"
-
-        // Remove this block.
-        jackOptions {
-            enabled true
-        }
     }
     lintOptions {
         abortOnError false
@@ -40,6 +35,7 @@ repositories {
         url 'https://google.bintray.com/tensorflow'
     }
 }
+
 allprojects {
     repositories {
         // Uncomment if you want to use a local repo.
@@ -48,20 +44,18 @@ allprojects {
     }
 }
 
-
-
 dependencies {
-    compile fileTree(dir: 'libs', include: ['*.jar'])
-    compile 'com.android.support:appcompat-v7:25.2.0'
-    compile 'com.android.support.constraint:constraint-layout:1.0.2'
-    compile 'com.android.support:design:25.2.0'
-    compile 'com.android.support:support-annotations:25.3.1'
-    compile 'com.android.support:support-v13:25.2.0'
+    implementation fileTree(dir: 'libs', include: ['*.jar'])
+    implementation 'com.android.support:appcompat-v7:25.2.0'
+    implementation 'com.android.support.constraint:constraint-layout:1.0.2'
+    implementation 'com.android.support:design:25.2.0'
+    implementation 'com.android.support:support-annotations:25.3.1'
+    implementation 'com.android.support:support-v13:25.2.0'
 
     // Build off of nightly TensorFlow Lite
-    compile 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
+    implementation 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
     // Use local TensorFlow library
-    // compile 'org.tensorflow:tensorflow-lite-local:0.0.0'
+    // implementation 'org.tensorflow:tensorflow-lite-local:0.0.0'
 }
 
 def targetFolder = "src/main/assets"
diff --git a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
index 165d33510131ac9c9fc08070f0a4d08653188fae..a7b34405369a5990535593b6ca2f30cb5f348440 100644
--- a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
@@ -476,7 +476,9 @@ public class Camera2BasicFragment extends Fragment
 
   @Override
   public void onDestroy() {
-    classifier.close();
+    if (classifier != null) {
+      classifier.close();
+    }
     super.onDestroy();
   }
 
diff --git a/tensorflow/lite/java/demo/build.gradle b/tensorflow/lite/java/demo/build.gradle
index b78a0b86c939620b6f05483ce45c4d3ef0ef595e..a88b3fdc70d9bbd45fa15ad31b4d38a377621c16 100644
--- a/tensorflow/lite/java/demo/build.gradle
+++ b/tensorflow/lite/java/demo/build.gradle
@@ -2,10 +2,11 @@
 
 buildscript {
     repositories {
+        google()
         jcenter()
     }
     dependencies {
-        classpath 'com.android.tools.build:gradle:2.3.1'
+        classpath 'com.android.tools.build:gradle:3.1.4'
 
         // NOTE: Do not place your application dependencies here; they belong
         // in the individual module build.gradle files
@@ -14,6 +15,7 @@ buildscript {
 
 allprojects {
     repositories {
+        google()
         jcenter()
     }
 }
diff --git a/tensorflow/lite/java/demo/gradle/wrapper/gradle-wrapper.properties b/tensorflow/lite/java/demo/gradle/wrapper/gradle-wrapper.properties
index fa7a38a0e43eecd1e7292dd49efa79a5d0742e2a..9ff32fe2bb7afeaefdc8b3d6a1ecb0d32e1aed60 100644
--- a/tensorflow/lite/java/demo/gradle/wrapper/gradle-wrapper.properties
+++ b/tensorflow/lite/java/demo/gradle/wrapper/gradle-wrapper.properties
@@ -3,4 +3,4 @@ distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-3.3-all.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-4.4-all.zip
diff --git a/tensorflow/lite/java/ovic/demo/app/build.gradle b/tensorflow/lite/java/ovic/demo/app/build.gradle
index 4f3a6cdb2f8fe58008c9315bf08f4d328e720073..77f568448a810c61ece9feef65fad422356be2f1 100644
--- a/tensorflow/lite/java/ovic/demo/app/build.gradle
+++ b/tensorflow/lite/java/ovic/demo/app/build.gradle
@@ -2,18 +2,13 @@ apply plugin: 'com.android.application'
 
 android {
     compileSdkVersion 26
-    buildToolsVersion "26.0.1"
+    buildToolsVersion "27.0.3"
     defaultConfig {
         applicationId "android.example.com.ovicbenchmarker"
         minSdkVersion 15
         targetSdkVersion 26
         versionCode 1
         versionName "1.0"
-
-        // Remove this block.
-        jackOptions {
-            enabled true
-        }
     }
     lintOptions {
         abortOnError false
@@ -41,12 +36,12 @@ repositories {
 }
 
 dependencies {
-    compile fileTree(dir: 'libs', include: ['*.jar'])
-    compile 'com.android.support:appcompat-v7:25.2.0'
-    compile 'com.android.support.constraint:constraint-layout:1.0.2'
-    compile 'com.android.support:design:25.2.0'
-    compile 'com.android.support:support-annotations:25.3.1'
-    compile 'com.android.support:support-v13:25.2.0'
+    implementation fileTree(dir: 'libs', include: ['*.jar'])
+    implementation 'com.android.support:appcompat-v7:25.2.0'
+    implementation 'com.android.support.constraint:constraint-layout:1.0.2'
+    implementation 'com.android.support:design:25.2.0'
+    implementation 'com.android.support:support-annotations:25.3.1'
+    implementation 'com.android.support:support-v13:25.2.0'
 
-    compile 'org.tensorflow:tensorflow-lite:+'
+    implementation 'org.tensorflow:tensorflow-lite:+'
 }
diff --git a/tensorflow/lite/java/ovic/demo/build.gradle b/tensorflow/lite/java/ovic/demo/build.gradle
index b78a0b86c939620b6f05483ce45c4d3ef0ef595e..a88b3fdc70d9bbd45fa15ad31b4d38a377621c16 100644
--- a/tensorflow/lite/java/ovic/demo/build.gradle
+++ b/tensorflow/lite/java/ovic/demo/build.gradle
@@ -2,10 +2,11 @@
 
 buildscript {
     repositories {
+        google()
         jcenter()
     }
     dependencies {
-        classpath 'com.android.tools.build:gradle:2.3.1'
+        classpath 'com.android.tools.build:gradle:3.1.4'
 
         // NOTE: Do not place your application dependencies here; they belong
         // in the individual module build.gradle files
@@ -14,6 +15,7 @@ buildscript {
 
 allprojects {
     repositories {
+        google()
         jcenter()
     }
 }
diff --git a/tensorflow/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.properties b/tensorflow/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.properties
index fa7a38a0e43eecd1e7292dd49efa79a5d0742e2a..9ff32fe2bb7afeaefdc8b3d6a1ecb0d32e1aed60 100644
--- a/tensorflow/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.properties
+++ b/tensorflow/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.properties
@@ -3,4 +3,4 @@ distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-3.3-all.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-4.4-all.zip
diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD
index 5cc06c7a6337101887d5064656cf92ea5184e9cc..7a4b6b8644be52274f298f6a23c55d677fcfdd35 100644
--- a/tensorflow/lite/kernels/BUILD
+++ b/tensorflow/lite/kernels/BUILD
@@ -25,9 +25,6 @@ tf_cc_test(
     name = "optional_tensor_test",
     size = "small",
     srcs = ["optional_tensor_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -122,9 +119,6 @@ tf_cc_test(
     name = "kernel_util_test",
     size = "small",
     srcs = ["kernel_util_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":kernel_util",
         "//tensorflow/lite/testing:util",
@@ -136,9 +130,6 @@ tf_cc_test(
     name = "test_util_test",
     size = "small",
     srcs = ["test_util_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",  # TODO(b/117786830)
-    ],
     deps = [
         ":test_util",
         "//tensorflow/lite/testing:util",
@@ -230,6 +221,7 @@ cc_library(
         "transpose_conv.cc",
         "unidirectional_sequence_lstm.cc",
         "unidirectional_sequence_rnn.cc",
+        "unique.cc",
         "unpack.cc",
         "zeros_like.cc",
     ],
@@ -304,9 +296,6 @@ tf_cc_test(
     name = "audio_spectrogram_test",
     size = "small",
     srcs = ["audio_spectrogram_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -320,9 +309,6 @@ tf_cc_test(
     name = "mfcc_test",
     size = "small",
     srcs = ["mfcc_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -336,9 +322,6 @@ tf_cc_test(
     name = "detection_postprocess_test",
     size = "small",
     srcs = ["detection_postprocess_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -352,9 +335,6 @@ tf_cc_test(
     name = "relu1_test",
     size = "small",
     srcs = ["relu1_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -368,13 +348,11 @@ tf_cc_test(
     name = "sparse_output_fully_connected_test",
     size = "small",
     srcs = ["sparse_output_fully_connected_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
         "//tensorflow/lite/kernels:test_util",
+        "//tensorflow/lite/kernels/internal:types",
         "@com_google_googletest//:gtest",
         "@flatbuffers",
     ],
@@ -384,7 +362,6 @@ tf_cc_test(
     name = "activations_test",
     size = "small",
     srcs = ["activations_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -397,7 +374,6 @@ tf_cc_test(
     name = "add_test",
     size = "small",
     srcs = ["add_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -410,9 +386,6 @@ tf_cc_test(
     name = "arg_min_max_test",
     size = "small",
     srcs = ["arg_min_max_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -425,9 +398,6 @@ tf_cc_test(
     name = "div_test",
     size = "small",
     srcs = ["div_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -440,9 +410,6 @@ tf_cc_test(
     name = "sub_test",
     size = "small",
     srcs = ["sub_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -455,9 +422,6 @@ tf_cc_test(
     name = "transpose_test",
     size = "small",
     srcs = ["transpose_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -472,9 +436,6 @@ tf_cc_test(
     name = "space_to_batch_nd_test",
     size = "small",
     srcs = ["space_to_batch_nd_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -487,9 +448,6 @@ tf_cc_test(
     name = "batch_to_space_nd_test",
     size = "small",
     srcs = ["batch_to_space_nd_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -502,9 +460,6 @@ tf_cc_test(
     name = "cast_test",
     size = "small",
     srcs = ["cast_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -517,7 +472,6 @@ tf_cc_test(
     name = "concatenation_test",
     size = "small",
     srcs = ["concatenation_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -530,7 +484,6 @@ tf_cc_test(
     name = "conv_test",
     size = "small",
     srcs = ["conv_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -544,7 +497,6 @@ tf_cc_test(
     name = "depthwise_conv_test",
     size = "small",
     srcs = ["depthwise_conv_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -558,13 +510,11 @@ tf_cc_test(
     name = "dequantize_test",
     size = "small",
     srcs = ["dequantize_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
         "//tensorflow/lite/kernels:test_util",
+        "//tensorflow/lite/kernels/internal:types",
         "@com_google_absl//absl/memory",
         "@com_google_googletest//:gtest",
     ],
@@ -574,7 +524,6 @@ tf_cc_test(
     name = "basic_rnn_test",
     size = "small",
     srcs = ["basic_rnn_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -587,9 +536,6 @@ tf_cc_test(
     name = "bidirectional_sequence_lstm_test",
     size = "small",
     srcs = ["bidirectional_sequence_lstm_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -603,9 +549,6 @@ tf_cc_test(
     name = "floor_test",
     size = "small",
     srcs = ["floor_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -618,9 +561,6 @@ tf_cc_test(
     name = "elementwise_test",
     size = "small",
     srcs = ["elementwise_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -633,9 +573,6 @@ tf_cc_test(
     name = "unidirectional_sequence_lstm_test",
     size = "small",
     srcs = ["unidirectional_sequence_lstm_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -648,9 +585,6 @@ tf_cc_test(
     name = "bidirectional_sequence_rnn_test",
     size = "small",
     srcs = ["bidirectional_sequence_rnn_test.cc"],
-    tags = [
-        "tflite_not_portable",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -663,9 +597,6 @@ tf_cc_test(
     name = "unidirectional_sequence_rnn_test",
     size = "small",
     srcs = ["unidirectional_sequence_rnn_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -678,7 +609,6 @@ tf_cc_test(
     name = "l2norm_test",
     size = "small",
     srcs = ["l2norm_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -691,9 +621,6 @@ tf_cc_test(
     name = "exp_test",
     size = "small",
     srcs = ["exp_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -706,9 +633,6 @@ tf_cc_test(
     name = "fake_quant_test",
     size = "small",
     srcs = ["fake_quant_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -721,9 +645,6 @@ tf_cc_test(
     name = "maximum_minimum_test",
     size = "small",
     srcs = ["maximum_minimum_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -736,9 +657,6 @@ tf_cc_test(
     name = "reduce_test",
     size = "small",
     srcs = ["reduce_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -751,7 +669,6 @@ tf_cc_test(
     name = "mul_test",
     size = "small",
     srcs = ["mul_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -764,9 +681,6 @@ tf_cc_test(
     name = "pad_test",
     size = "small",
     srcs = ["pad_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -779,7 +693,6 @@ tf_cc_test(
     name = "reshape_test",
     size = "small",
     srcs = ["reshape_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -792,9 +705,6 @@ tf_cc_test(
     name = "gather_test",
     size = "small",
     srcs = ["gather_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -808,9 +718,6 @@ tf_cc_test(
     name = "topk_v2_test",
     size = "small",
     srcs = ["topk_v2_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -824,7 +731,6 @@ tf_cc_test(
     name = "resize_bilinear_test",
     size = "small",
     srcs = ["resize_bilinear_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -837,7 +743,6 @@ tf_cc_test(
     name = "resize_nearest_neighbor_test",
     size = "small",
     srcs = ["resize_nearest_neighbor_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -850,7 +755,6 @@ tf_cc_test(
     name = "svdf_test",
     size = "small",
     srcs = ["svdf_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -863,7 +767,6 @@ tf_cc_test(
     name = "embedding_lookup_test",
     size = "small",
     srcs = ["embedding_lookup_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -876,7 +779,6 @@ tf_cc_test(
     name = "embedding_lookup_sparse_test",
     size = "small",
     srcs = ["embedding_lookup_sparse_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -889,7 +791,6 @@ tf_cc_test(
     name = "fully_connected_test",
     size = "small",
     srcs = ["fully_connected_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -904,7 +805,6 @@ tf_cc_test(
     name = "local_response_norm_test",
     size = "small",
     srcs = ["local_response_norm_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -917,7 +817,6 @@ tf_cc_test(
     name = "pooling_test",
     size = "small",
     srcs = ["pooling_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -930,7 +829,6 @@ tf_cc_test(
     name = "softmax_test",
     size = "small",
     srcs = ["softmax_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -944,9 +842,6 @@ tf_cc_test(
     name = "log_softmax_test",
     size = "small",
     srcs = ["log_softmax_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -960,7 +855,6 @@ tf_cc_test(
     name = "lsh_projection_test",
     size = "small",
     srcs = ["lsh_projection_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -973,7 +867,6 @@ tf_cc_test(
     name = "hashtable_lookup_test",
     size = "small",
     srcs = ["hashtable_lookup_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -987,7 +880,6 @@ tf_cc_test(
     name = "layer_norm_lstm_test",
     size = "small",
     srcs = ["layer_norm_lstm_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1001,7 +893,6 @@ tf_cc_test(
     name = "lstm_test",
     size = "small",
     srcs = ["lstm_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1014,7 +905,6 @@ tf_cc_test(
     name = "skip_gram_test",
     size = "small",
     srcs = ["skip_gram_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1028,7 +918,6 @@ tf_cc_test(
     name = "space_to_depth_test",
     size = "small",
     srcs = ["space_to_depth_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1041,9 +930,6 @@ tf_cc_test(
     name = "split_test",
     size = "small",
     srcs = ["split_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1056,9 +942,6 @@ tf_cc_test(
     name = "split_v_test",
     size = "small",
     srcs = ["split_v_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1071,9 +954,6 @@ tf_cc_test(
     name = "squeeze_test",
     size = "small",
     srcs = ["squeeze_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1086,9 +966,6 @@ tf_cc_test(
     name = "strided_slice_test",
     size = "small",
     srcs = ["strided_slice_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1101,9 +978,6 @@ tf_cc_test(
     name = "tile_test",
     size = "small",
     srcs = ["tile_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1119,9 +993,6 @@ tf_cc_test(
     srcs = [
         "comparisons_test.cc",
     ],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1134,9 +1005,6 @@ tf_cc_test(
     name = "neg_test",
     size = "small",
     srcs = ["neg_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1151,9 +1019,6 @@ tf_cc_test(
     srcs = [
         "select_test.cc",
     ],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1168,9 +1033,6 @@ tf_cc_test(
     srcs = [
         "slice_test.cc",
     ],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1183,9 +1045,6 @@ tf_cc_test(
     name = "transpose_conv_test",
     size = "small",
     srcs = ["transpose_conv_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1199,9 +1058,6 @@ tf_cc_test(
     name = "expand_dims_test",
     size = "small",
     srcs = ["expand_dims_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1215,9 +1071,6 @@ tf_cc_test(
     name = "sparse_to_dense_test",
     size = "small",
     srcs = ["sparse_to_dense_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1231,9 +1084,6 @@ tf_cc_test(
     name = "shape_test",
     size = "small",
     srcs = ["shape_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1247,9 +1097,6 @@ tf_cc_test(
     name = "pow_test",
     size = "small",
     srcs = ["pow_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1263,7 +1110,6 @@ tf_cc_test(
     name = "pack_test",
     size = "small",
     srcs = ["pack_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1277,7 +1123,6 @@ tf_cc_test(
     name = "one_hot_test",
     size = "small",
     srcs = ["one_hot_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1290,7 +1135,6 @@ tf_cc_test(
     name = "logical_test",
     size = "small",
     srcs = ["logical_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1304,7 +1148,6 @@ tf_cc_test(
     name = "unpack_test",
     size = "small",
     srcs = ["unpack_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:builtin_op_data",
@@ -1318,7 +1161,6 @@ tf_cc_test(
     name = "floor_div_test",
     size = "small",
     srcs = ["floor_div_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:builtin_op_data",
@@ -1332,7 +1174,6 @@ tf_cc_test(
     name = "zeros_like_test",
     size = "small",
     srcs = ["zeros_like_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:builtin_op_data",
@@ -1346,7 +1187,6 @@ tf_cc_test(
     name = "floor_mod_test",
     size = "small",
     srcs = ["floor_mod_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:builtin_op_data",
@@ -1360,7 +1200,6 @@ tf_cc_test(
     name = "range_test",
     size = "small",
     srcs = ["range_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:builtin_op_data",
@@ -1374,7 +1213,6 @@ tf_cc_test(
     name = "squared_difference_test",
     size = "small",
     srcs = ["squared_difference_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1387,7 +1225,6 @@ tf_cc_test(
     name = "fill_test",
     size = "small",
     srcs = ["fill_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1396,6 +1233,17 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "unique_test",
+    srcs = ["unique_test.cc"],
+    deps = [
+        ":builtin_ops",
+        ":test_util",
+        "//tensorflow/lite:framework",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc
index 13ef94c016698421bede8d3780043cf5c5ae4a0e..4463a6c5a65bf848ad68635717750d3a214dd0a0 100644
--- a/tensorflow/lite/kernels/activations.cc
+++ b/tensorflow/lite/kernels/activations.cc
@@ -57,9 +57,9 @@ struct PreluOpData : public OpData {
 };
 
 namespace {
-TfLiteStatus CheckInputQuantParams(TfLiteContext* context,
-                                   const TfLiteTensor* input,
-                                   const TfLiteTensor* output) {
+TfLiteStatus CheckOutputQuantParams(TfLiteContext* context,
+                                    const TfLiteTensor* input,
+                                    const TfLiteTensor* output) {
   if (input->type == kTfLiteUInt8) {
     TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
     TF_LITE_ENSURE(context, output->params.scale == 1. / 256);
@@ -236,7 +236,7 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE(context, num_dims >= 1 && num_dims <= 4);
 
   if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
-    if (CheckInputQuantParams(context, input, output) == kTfLiteError) {
+    if (CheckOutputQuantParams(context, input, output) == kTfLiteError) {
       return kTfLiteError;
     }
 
diff --git a/tensorflow/lite/kernels/basic_rnn_test.cc b/tensorflow/lite/kernels/basic_rnn_test.cc
index c71849fff3d0457f9d39bf62907fb7b9e22c1059..9eb20444a6d119ec940a140a66e59961f1451c1c 100644
--- a/tensorflow/lite/kernels/basic_rnn_test.cc
+++ b/tensorflow/lite/kernels/basic_rnn_test.cc
@@ -240,7 +240,7 @@ class HybridRNNOpModel : public RNNOpModel {
 
   TensorType tensor_type_;
 
-  void SetWeights(int weights_idx, std::vector<float> f) {
+  void SetWeights(int weights_idx, const std::vector<float>& f) {
     if (tensor_type_ == TensorType_UINT8) {
       SymmetricQuantizeAndPopulate(weights_idx, f);
     } else {
diff --git a/tensorflow/lite/kernels/batch_to_space_nd_test.cc b/tensorflow/lite/kernels/batch_to_space_nd_test.cc
index a3e06d4c89327050625ac514d41bc29c4f6493f3..f33089559992c1a6a6fa34161122c43b7954fbdb 100644
--- a/tensorflow/lite/kernels/batch_to_space_nd_test.cc
+++ b/tensorflow/lite/kernels/batch_to_space_nd_test.cc
@@ -114,6 +114,7 @@ TEST(BatchToSpaceNDOpTest, SimpleDynamicTest) {
                                                4, 8, 11, 15, 12, 16}));
 }
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(BatchToSpaceNDOpTest, InvalidShapeTest) {
   EXPECT_DEATH(BatchToSpaceNDOpConstModel({3, 2, 2, 1}, {2, 2}, {0, 0, 0, 0}),
                "Cannot allocate tensors");
@@ -131,6 +132,7 @@ TEST(BatchToSpaceNDOpTest, InvalidCropsDynamicTest) {
   m.SetCrops({0, 0, -1, 0});
   EXPECT_DEATH(m.Invoke(), "crops.2. >= 0 was not true.");
 }
+#endif
 
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc b/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc
index 1ddfe7201ead5cd9840bf2c982793f11cb21bdbf..31c6e3f44c8323cee38d196b4cd24031586ad1b0 100644
--- a/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc
+++ b/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc
@@ -105,7 +105,10 @@ constexpr int kBwInputActivationStateTensor = 37;
 // Cell state tensors of size {n_batch, n_cell}
 constexpr int kBwInputCellStateTensor = 38;
 
-// Auxiliary input and weights when stacking.
+// Used as auxiliary input and weights when stacking for
+// tf.contrib.rnn.stack_bidirectional_rnn case (with cross links); Used as input
+// to the backward cell when stacking for tf.nn.static_bidirectional_rnn case
+// (without cross links).
 constexpr int kAuxInputTensor = 39;  // Optional
 // Forward weights.
 constexpr int kFwAuxInputToInputWeightsTensor = 40;   // Optional
@@ -459,8 +462,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* bw_aux_input_to_output_weights =
       GetOptionalInputTensor(context, node, kBwAuxInputToOutputWeightsTensor);
 
-  const bool aux_inputs_all_or_none =
-      ((aux_input != nullptr) && (fw_aux_input_to_cell_weights != nullptr) &&
+  const bool aux_inputs_weights_all_or_none =
+      ((fw_aux_input_to_cell_weights != nullptr) &&
        (fw_aux_input_to_forget_weights != nullptr) &&
        (fw_aux_input_to_output_weights != nullptr) &&
        (bw_aux_input_to_cell_weights != nullptr) &&
@@ -472,8 +475,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
        (bw_aux_input_to_cell_weights == nullptr) &&
        (bw_aux_input_to_forget_weights == nullptr) &&
        (bw_aux_input_to_output_weights == nullptr));
-  TF_LITE_ENSURE(context, aux_inputs_all_or_none);
-  const bool has_aux_input = (aux_input != nullptr);
+  TF_LITE_ENSURE(context, aux_inputs_weights_all_or_none);
+
+  const bool has_aux_input = (fw_aux_input_to_forget_weights != nullptr);
 
   if (has_aux_input) {
     // Check that aux_input has the same dimensions (except last) as the input.
@@ -870,6 +874,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* bw_aux_input_to_output_weights =
       GetOptionalInputTensor(context, node, kBwAuxInputToOutputWeightsTensor);
 
+  const bool has_previous_bw_output = (aux_input != nullptr);
+  const bool use_aux_input = (fw_aux_input_to_forget_weights != nullptr);
+
   // Populate a TfLiteLSTMParams struct for the evaluation functions.
   TfLiteLSTMParams lstm_params = {params->activation, params->cell_clip,
                                   params->proj_clip, kTfLiteLSTMFullKernel};
@@ -879,6 +886,26 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const auto actual_bw_output = params->merge_outputs ? fw_output : bw_output;
 
   const bool time_major = params->time_major;
+
+  // We want to cover the following cases:
+  //
+  // If not stacking (not connected after other bidi lstms):
+  //   both fw & bw will just use `input`; aux_input will be null.
+  //
+  // If stacking with cross_links, TensorFlow equivalent
+  // (tf.contrib.rnn.stack_bidirectional_rnn):
+  //   both fw & bw will use `input`, but aux_input will be none null.
+  //   Note, this time, whether connected after other bidi lstms both works.
+  //
+  // If stacking without cross_links, but connected after other bidi lstms,
+  // TensorFlow equivalent (tf.nn.static_bidirectional_rnn):
+  //   fw will use `input`, bw will use aux_input, and the `real aux_input`
+  //   will be null.
+
+  const bool non_stacking_mode = !use_aux_input && has_previous_bw_output;
+  const TfLiteTensor* bw_input = non_stacking_mode ? aux_input : input;
+  const TfLiteTensor* real_aux_input = non_stacking_mode ? nullptr : aux_input;
+
   switch (fw_input_to_output_weights->type) {
     case kTfLiteFloat32: {
       TfLiteStatus fw_pass_status = lstm_eval::EvalFloat(
@@ -891,7 +918,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*input_layer_norm_coefficients=*/nullptr,
           /*forget_layer_norm_coefficients=*/nullptr,
           /*cell_layer_norm_coefficients=*/nullptr,
-          /*output_layer_norm_coefficients=*/nullptr, aux_input,
+          /*output_layer_norm_coefficients=*/nullptr, real_aux_input,
           fw_aux_input_to_input_weights, fw_aux_input_to_forget_weights,
           fw_aux_input_to_cell_weights, fw_aux_input_to_output_weights,
           fw_input_gate_bias, fw_forget_gate_bias, fw_cell_bias,
@@ -902,7 +929,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_ENSURE_OK(context, fw_pass_status);
 
       TfLiteStatus bw_pass_status = lstm_eval::EvalFloat(
-          input, bw_input_to_input_weights, bw_input_to_forget_weights,
+          bw_input, bw_input_to_input_weights, bw_input_to_forget_weights,
           bw_input_to_cell_weights, bw_input_to_output_weights,
           bw_recurrent_to_input_weights, bw_recurrent_to_forget_weights,
           bw_recurrent_to_cell_weights, bw_recurrent_to_output_weights,
@@ -911,7 +938,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*input_layer_norm_coefficients=*/nullptr,
           /*forget_layer_norm_coefficients=*/nullptr,
           /*cell_layer_norm_coefficients=*/nullptr,
-          /*output_layer_norm_coefficients=*/nullptr, aux_input,
+          /*output_layer_norm_coefficients=*/nullptr, real_aux_input,
           bw_aux_input_to_input_weights, bw_aux_input_to_forget_weights,
           bw_aux_input_to_cell_weights, bw_aux_input_to_output_weights,
           bw_input_gate_bias, bw_forget_gate_bias, bw_cell_bias,
@@ -942,9 +969,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TfLiteTensor* recovered_cell_weights =
           GetTemporary(context, node, kRecoveredCellWeights);
       TfLiteTensor* aux_input_quantized =
-          (aux_input == nullptr)
-              ? nullptr
-              : GetTemporary(context, node, kAuxInputQuantized);
+          use_aux_input ? GetTemporary(context, node, kAuxInputQuantized)
+                        : nullptr;
 
       TfLiteStatus fw_pass_status = lstm_eval::EvalHybrid(
           input, fw_input_to_input_weights, fw_input_to_forget_weights,
@@ -956,7 +982,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*input_layer_norm_coefficients=*/nullptr,
           /*forget_layer_norm_coefficients=*/nullptr,
           /*cell_layer_norm_coefficients=*/nullptr,
-          /*output_layer_norm_coefficients=*/nullptr, aux_input,
+          /*output_layer_norm_coefficients=*/nullptr, real_aux_input,
           fw_aux_input_to_input_weights, fw_aux_input_to_forget_weights,
           fw_aux_input_to_cell_weights, fw_aux_input_to_output_weights,
           fw_input_gate_bias, fw_forget_gate_bias, fw_cell_bias,
@@ -970,7 +996,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_ENSURE_OK(context, fw_pass_status);
 
       TfLiteStatus bw_pass_status = lstm_eval::EvalHybrid(
-          input, bw_input_to_input_weights, bw_input_to_forget_weights,
+          bw_input, bw_input_to_input_weights, bw_input_to_forget_weights,
           bw_input_to_cell_weights, bw_input_to_output_weights,
           bw_recurrent_to_input_weights, bw_recurrent_to_forget_weights,
           bw_recurrent_to_cell_weights, bw_recurrent_to_output_weights,
@@ -979,7 +1005,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*input_layer_norm_coefficients=*/nullptr,
           /*forget_layer_norm_coefficients=*/nullptr,
           /*cell_layer_norm_coefficients=*/nullptr,
-          /*output_layer_norm_coefficients=*/nullptr, aux_input,
+          /*output_layer_norm_coefficients=*/nullptr, real_aux_input,
           bw_aux_input_to_input_weights, bw_aux_input_to_forget_weights,
           bw_aux_input_to_cell_weights, bw_aux_input_to_output_weights,
           bw_input_gate_bias, bw_forget_gate_bias, bw_cell_bias,
diff --git a/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc b/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc
index f5df6d15af7912d663f61b9df93d92d4c029e2d5..59ea47a2a22f60482c0f95a917e46e8f7b61e5ec 100644
--- a/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc
+++ b/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc
@@ -38,7 +38,7 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
                            int sequence_length, bool use_cifg,
                            bool use_peephole, bool use_projection_weights,
                            bool use_projection_bias, bool merge_outputs,
-                           float cell_clip, float proj_clip,
+                           bool use_aux_input, float cell_clip, float proj_clip,
                            bool quantize_weights, bool time_major,
                            const std::vector<std::vector<int>>& input_shapes)
       : n_batch_(n_batch),
@@ -185,7 +185,11 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
       bw_output_ = AddOutput(TensorType_FLOAT32);
     }
 
-    aux_input_ = AddNullInput();
+    if (use_aux_input) {
+      aux_input_ = AddInput(TensorType_FLOAT32);
+    } else {
+      aux_input_ = AddNullInput();
+    }
     fw_aux_input_to_input_weights_ = AddNullInput();
     fw_aux_input_to_forget_weights_ = AddNullInput();
     fw_aux_input_to_cell_weights_ = AddNullInput();
@@ -302,6 +306,10 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
     PopulateTensor(input_, offset, begin, end);
   }
 
+  void SetAuxInput(int offset, float* begin, float* end) {
+    PopulateTensor(aux_input_, offset, begin, end);
+  }
+
   std::vector<float> GetFwOutput() { return ExtractVector<float>(fw_output_); }
   std::vector<float> GetBwOutput() { return ExtractVector<float>(bw_output_); }
 
@@ -406,7 +414,8 @@ TEST_P(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/false, /*use_projection_weights=*/false,
-      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
       /*proj_clip=*/0.0, quantize_weights, /*time_major=*/true,
       {
           {sequence_length, n_batch, n_input},  // input tensor
@@ -570,7 +579,8 @@ TEST_P(LSTMOpTest, BlackBoxTestMergedOutput) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/false, /*use_projection_weights=*/false,
-      /*use_projection_bias=*/false, /*merge_outputs=*/true, /*cell_clip=*/0.0,
+      /*use_projection_bias=*/false, /*merge_outputs=*/true,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
       /*proj_clip=*/0.0, quantize_weights, /*time_major=*/true,
       {
           {sequence_length, n_batch, n_input},  // input tensor
@@ -733,7 +743,8 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClippingReverse) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/false, /*use_projection_weights=*/false,
-      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
       /*proj_clip=*/0.0, /*quantize_weights=*/false, /*time_major=*/true,
       {
           {sequence_length, n_batch, n_input},  // input tensor
@@ -895,7 +906,8 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/true,
       /*use_peephole=*/true, /*use_projection_weights=*/false,
-      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
       /*proj_clip=*/0.0, /*quantize_weights=*/false, /*time_major=*/true,
       {
           {sequence_length, n_batch, n_input},  // input tensor
@@ -1047,7 +1059,8 @@ TEST(LSTMOpTest,
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/true,
       /*use_peephole=*/true, /*use_projection_weights=*/false,
-      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
       /*proj_clip=*/0.0, /*quantize_weights=*/false, /*time_major=*/true,
       {
           {sequence_length, n_batch, n_input},  // input tensor
@@ -1199,7 +1212,8 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/true, /*use_projection_weights=*/true,
-      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
       /*proj_clip=*/0.0, /*quantize_weights=*/false, /*time_major=*/true,
       {
           {sequence_length, n_batch, n_input},  // input tensor
@@ -1903,7 +1917,8 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClippingBatchMajor) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/true, /*use_projection_weights=*/true,
-      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
       /*proj_clip=*/0.0, /*quantize_weights=*/false, /*time_major=*/false,
       {
           {n_batch, sequence_length, n_input},  // input tensor
@@ -2590,6 +2605,175 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClippingBatchMajor) {
   EXPECT_THAT(combined, ElementsAreArray(ArrayFloatNear(expected)));
 }
 
+// Same as the no cifg no peephole no projection no clipping test, but have an
+// aux input (without aux input weights), this is the case when stacking but no
+// cross-links.
+TEST_P(LSTMOpTest, BlackBoxTestWithAuxInput) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+  const int sequence_length = 3;
+  const bool quantize_weights = GetParam();
+
+  BidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
+      /*use_peephole=*/false, /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/true, /*cell_clip=*/0.0,
+      /*proj_clip=*/0.0, quantize_weights, /*time_major=*/true,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          // Forward cell
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},  // cell_to_input_weight tensor
+          {0},  // cell_to_forget_weight tensor
+          {0},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+
+          // Backward cell
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},  // cell_to_input_weight tensor
+          {0},  // cell_to_forget_weight tensor
+          {0},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+
+          // TODO(b/121134029): Update tests so tensor shapes after state tensor
+          // are used. They are currently ignored by test_util.
+          {sequence_length, n_batch, n_input},  // aux_input tensor
+          {n_cell, 0},                          // aux_fw_input_to_input tensor
+          {n_cell, 0},                          // aux_fw_input_to_forget tensor
+          {n_cell, 0},                          // aux_fw_input_to_cell tensor
+          {n_cell, 0},                          // aux_fw_input_to_output tensor
+          {n_cell, 0},                          // aux_bw_input_to_input tensor
+          {n_cell, 0},                          // aux_bw_input_to_forget tensor
+          {n_cell, 0},                          // aux_bw_input_to_cell tensor
+          {n_cell, 0},                          // aux_bw_input_to_output tensor
+      });
+
+  lstm.SetInputToInputWeights({-0.45018822, -0.02338299, -0.0870589,
+                               -0.34550029, 0.04266912, -0.15680569,
+                               -0.34856534, 0.43890524});
+
+  lstm.SetInputToCellWeights({-0.50013041, 0.1370284, 0.11810488, 0.2013163,
+                              -0.20583314, 0.44344562, 0.22077113,
+                              -0.29909778});
+
+  lstm.SetInputToForgetWeights({0.09701663, 0.20334584, -0.50592935,
+                                -0.31343272, -0.40032279, 0.44781327,
+                                0.01387155, -0.35593212});
+
+  lstm.SetInputToOutputWeights({-0.25065863, -0.28290087, 0.04613829,
+                                0.40525138, 0.44272184, 0.03897077, -0.1556896,
+                                0.19487578});
+
+  lstm.SetInputGateBias({0., 0., 0., 0.});
+
+  lstm.SetCellBias({0., 0., 0., 0.});
+
+  lstm.SetForgetGateBias({1., 1., 1., 1.});
+
+  lstm.SetOutputGateBias({0., 0., 0., 0.});
+
+  lstm.SetRecurrentToInputWeights(
+      {-0.0063535, -0.2042388, 0.31454784, -0.35746509, 0.28902304, 0.08183324,
+       -0.16555229, 0.02286911, -0.13566875, 0.03034258, 0.48091322,
+       -0.12528998, 0.24077177, -0.51332325, -0.33502164, 0.10629296});
+
+  lstm.SetRecurrentToCellWeights(
+      {-0.3407414, 0.24443203, -0.2078532, 0.26320225, 0.05695659, -0.00123841,
+       -0.4744786, -0.35869038, -0.06418842, -0.13502428, -0.501764, 0.22830659,
+       -0.46367589, 0.26016325, -0.03894562, -0.16368064});
+
+  lstm.SetRecurrentToForgetWeights(
+      {-0.48684245, -0.06655136, 0.42224967, 0.2112639, 0.27654213, 0.20864892,
+       -0.07646349, 0.45877004, 0.00141793, -0.14609534, 0.36447752, 0.09196436,
+       0.28053468, 0.01560611, -0.20127171, -0.01140004});
+
+  lstm.SetRecurrentToOutputWeights(
+      {0.43385774, -0.17194885, 0.2718237, 0.09215671, 0.24107647, -0.39835793,
+       0.18212086, 0.01301402, 0.48572797, -0.50656658, 0.20047462, -0.20607421,
+       -0.51818722, -0.15390486, 0.0468148, 0.39922136});
+
+  // Input should have n_input * sequence_length many values.
+  static float lstm_input[] = {2., 3., 3., 4., 1., 1.};
+  static float lstm_fw_golden_output[] = {
+      -0.02973187, 0.1229473,  0.20885126, -0.15358765,
+      -0.03716109, 0.12507336, 0.41193449, -0.20860538,
+      -0.15053082, 0.09120187, 0.24278517, -0.12222792};
+  static float lstm_bw_golden_output[] = {
+      -0.0806187, 0.139077, 0.400476,   -0.197842, -0.0332076, 0.123838,
+      0.309777,   -0.17621, -0.0490733, 0.0739237, 0.067706,   -0.0208124};
+
+  float* batch0_start = lstm_input;
+  float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length();
+
+  lstm.SetInput(0, batch0_start, batch0_end);
+  // Aux input and input are the same, so we should observe the same outputs
+  // as there's no aux input.
+  lstm.SetAuxInput(0, batch0_start, batch0_end);
+
+  lstm.Invoke();
+
+  float* fw_golden_start = lstm_fw_golden_output;
+  float* fw_golden_end =
+      fw_golden_start + lstm.num_fw_outputs() * lstm.sequence_length();
+  std::vector<float> fw_expected;
+  fw_expected.insert(fw_expected.end(), fw_golden_start, fw_golden_end);
+  EXPECT_THAT(lstm.GetFwOutput(),
+              ElementsAreArray(
+                  ArrayFloatNear(fw_expected, quantize_weights ? 1e-2 : 1e-5)));
+
+  float* bw_golden_start = lstm_bw_golden_output;
+  float* bw_golden_end =
+      bw_golden_start + lstm.num_bw_outputs() * lstm.sequence_length();
+  std::vector<float> bw_expected;
+  bw_expected.insert(bw_expected.end(), bw_golden_start, bw_golden_end);
+  EXPECT_THAT(lstm.GetBwOutput(),
+              ElementsAreArray(
+                  ArrayFloatNear(bw_expected, quantize_weights ? 1e-2 : 1e-5)));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/dequantize_test.cc b/tensorflow/lite/kernels/dequantize_test.cc
index bb5f1e74a8b0174209043e14af9c35db32bf14b5..be7caa31892a9dbb41eef2f88479c9f0051e2339 100644
--- a/tensorflow/lite/kernels/dequantize_test.cc
+++ b/tensorflow/lite/kernels/dequantize_test.cc
@@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include <cstdint>
+
 #include <gtest/gtest.h>
 #include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/model.h"
@@ -58,7 +61,7 @@ TEST(DequantizeOpTest, UINT8) {
   // [-63.5, 64] -> scale=0.5 zero_point=127 for UINT8
   DequantizeOpModel m(TensorType_UINT8, {2, 5}, 0.5, 127);
 
-  m.SetInput<uint8>({0, 1, 2, 3, 4, 251, 252, 253, 254, 255});
+  m.SetInput<uint8_t>({0, 1, 2, 3, 4, 251, 252, 253, 254, 255});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(),
               ElementsAreArray(ArrayFloatNear(
@@ -69,7 +72,7 @@ TEST(DequantizeOpTest, INT8) {
   // [-63.5, 64] -> scale=0.5, zero_point=1 for INT8
   DequantizeOpModel m(TensorType_INT8, {2, 5}, 0.5, -1);
 
-  m.SetInput<int8>({-128, -127, -126, -125, -124, 123, 124, 125, 126, 127});
+  m.SetInput<int8_t>({-128, -127, -126, -125, -124, 123, 124, 125, 126, 127});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(),
               ElementsAreArray(ArrayFloatNear(
diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD
index 5eb8a353b79bc7e4cb8e990b216c712cd7b99434..b734b2d6cc30bb84eaa424ffed71747136f57c4c 100644
--- a/tensorflow/lite/kernels/internal/BUILD
+++ b/tensorflow/lite/kernels/internal/BUILD
@@ -1,3 +1,4 @@
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow/lite:build_def.bzl", "tflite_copts")
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 
@@ -545,16 +546,21 @@ cc_library(
     name = "test_util",
     srcs = ["test_util.cc"],
     hdrs = ["test_util.h"],
+    linkopts = select({
+        "//tensorflow:windows": [],
+        "//conditions:default": ["-lm"],
+    }),
     deps = [
         ":types",
         "//tensorflow/lite:string",
     ],
 )
 
-cc_test(
+# TODO(b/122597976): Eliminate TF dependency from lite/kernels:test_util,
+# in turn eliminating the need to use tf_cc_test for any dependent tests.
+tf_cc_test(
     name = "tensor_utils_test",
     srcs = ["tensor_utils_test.cc"],
-    copts = NEON_FLAGS_IF_APPLICABLE,
     linkopts = select({
         "//tensorflow:android": [
             "-fPIE -pie",
@@ -637,6 +643,7 @@ cc_test(
     srcs = [
         "softmax_quantized_test.cc",
     ],
+    shard_count = 3,
     deps = [
         ":optimized_base",
         ":quantization_util",
@@ -653,7 +660,10 @@ cc_test(
     srcs = [
         "logsoftmax_quantized_test.cc",
     ],
+    shard_count = 3,
     tags = [
+        # TODO(b/122242739): Reenable after fixing the flakiness?
+        "nomac",
         "tflite_not_portable",
     ],
     deps = [
diff --git a/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc b/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc
index 889a726f3a915fb592511d34c036b9726542fee9..945300dad1653257db69c3440f6db0589e0c1a7b 100644
--- a/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc
@@ -225,7 +225,7 @@ bool TryOneSkyscraperLogSoftmax(bool small_depth) {
 }
 
 TEST(TestQuantizedLogSoftmax, UniformLogSoftmaxTests) {
-  const int kTestsToRun = 1000;
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneUniformLogSoftmax()) {
     }
@@ -233,7 +233,7 @@ TEST(TestQuantizedLogSoftmax, UniformLogSoftmaxTests) {
 }
 
 TEST(TestQuantizedLogSoftmax, SkyscraperLogSoftmaxTests) {
-  const int kTestsToRun = 1000;
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneSkyscraperLogSoftmax(false)) {
     }
@@ -241,7 +241,7 @@ TEST(TestQuantizedLogSoftmax, SkyscraperLogSoftmaxTests) {
 }
 
 TEST(TestQuantizedLogSoftmax, SmallSkyscraperLogSoftmaxTests) {
-  const int kTestsToRun = 1000;
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneSkyscraperLogSoftmax(true)) {
     }
diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
index cf40ebb241d013a4853854f57fd55ebbce8a1752..cf1bda2661b13bb0292ea76946fdff060aa91d43 100644
--- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
+++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
@@ -144,7 +144,7 @@ void NeonMatrixBatchVectorMultiplyAccumulate(
         // registers).
         int16x8_t prod_16x8 =
             vmull_s8(vget_low_s8(s1_8x16), vget_low_s8(s2_8x16));
-        // Multiply the high bits (i.e. the lower 8 8bit numbers in the
+        // Multiply the high bits (i.e. the higher 8 8bit numbers in the
         // registers), and accumulate with the result of the low bits product.
         // The assumption here is that overflow will not happen as we quantize
         // our values to be in the range [-127, 127]. As such the sum of the 2
diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
index 7bc6b324c52efcfdd78ac0e5d95af8cb67d40a9c..ac68757b0605b03eee5a70c09fb45f8417b364a9 100644
--- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
@@ -5387,9 +5387,6 @@ inline void ResizeBilinearGenericSmallChannel(
     int32 output_height, int32 output_width, float height_scale,
     float width_scale, const RuntimeShape& input_shape, const T* input_data,
     const RuntimeShape& output_shape, T* output_data) {
-  memset(output_data, 0,
-         batches * output_height * output_width * depth * sizeof(T));
-
   T* output_ptr = &output_data[0];
   for (int b = 0; b < batches; ++b) {
     for (int y = 0; y < output_height; ++y) {
@@ -5398,7 +5395,7 @@ inline void ResizeBilinearGenericSmallChannel(
       int32 y1 = std::min(y0 + 1, input_height - 1);
       for (int x = 0; x < output_width; ++x) {
         float input_x = x * width_scale;
-        int32 x0 = static_cast<int32>(input_x);
+        int32 x0 = static_cast<int32>(std::floor((input_x)));
         int32 x1 = std::min(x0 + 1, input_width - 1);
 
         int32 input_offset[4] = {Offset(input_shape, b, y0, x0, 0),
@@ -6082,7 +6079,27 @@ inline void TransposeConv(
     const float* filter_data, const RuntimeShape& output_shape,
     float* output_data, const RuntimeShape& im2col_shape, float* im2col_data) {
   gemmlowp::ScopedProfilingLabel label("TransposeConv");
-
+  // The complexity of the reference implementation is input.flat_size() *
+  // filter.flat_size() / in_channel.
+  //
+  // While the complexity of im2col->gemm
+  // implmentation is batch * output_height * output_width *
+  // (filter.flat_size() / out_channel)^2 * out_channel.
+  //
+  // so if input.flat_size() * out_channel^2 is much smaller than
+  // output.flat_size() * filter.size() * in_channel we should fall back to the
+  // reference implementation.
+  //
+  // TODO(b/122331966): optimize the intuitive implementation.
+  const int out_channel = output_shape.Dims(3);
+  const int in_channel = input_shape.Dims(3);
+  if ((input_shape.FlatSize() * out_channel * out_channel * 4) <
+      (filter_shape.FlatSize() * output_shape.FlatSize() * in_channel)) {
+    reference_ops::TransposeConv(params, input_shape, input_data, filter_shape,
+                                 filter_data, output_shape, output_data,
+                                 im2col_shape, im2col_data);
+    return;
+  }
   // Note we could use transposed weights with forward conv for unstrided
   // cases. But we are already getting good performance with this code as-is.
   TFLITE_DCHECK(im2col_data);
diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
index d692063a968dab654eaf46b9956ddcd338b64410..1acf0caad0db8481965fcba0bc1fafb41bd23f47 100644
--- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
@@ -101,7 +101,6 @@ void PortableMatrixBatchVectorMultiplyAccumulate(
       __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
                          3 /* temporal locality */);
 #endif
-      // For every block of 16 8-bit elements (128-bit register) from each row.
       for (col = 0; col < m_cols; ++col, ++row_ptr) {
         dotprod += (*row_ptr) * (vectors[col]);
       }  // for col
diff --git a/tensorflow/lite/kernels/internal/resize_bilinear_test.cc b/tensorflow/lite/kernels/internal/resize_bilinear_test.cc
index 1c5ac1992f0f649ca47e2a5bc81ea332abc46bf5..4a19b69a7c9dfc70192d446f922052606c516365 100644
--- a/tensorflow/lite/kernels/internal/resize_bilinear_test.cc
+++ b/tensorflow/lite/kernels/internal/resize_bilinear_test.cc
@@ -76,6 +76,7 @@ void TestOneResizeBilinear(int batch, int depth, int input_width,
 }
 
 TEST(ResizeBilinear, TestResizeBilinear8Bit) {
+  RandomEngine().seed(38291);
   const int kTestsToRun = 100 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
     const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
@@ -91,6 +92,7 @@ TEST(ResizeBilinear, TestResizeBilinear8Bit) {
 }
 
 TEST(ResizeBilinear2x2, TestResizeBilinear8Bit) {
+  RandomEngine().seed(38291);
   const int kTestsToRun = 100 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
     const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
@@ -106,6 +108,7 @@ TEST(ResizeBilinear2x2, TestResizeBilinear8Bit) {
 }
 
 TEST(ResizeBilinear, TestResizeBilinear) {
+  RandomEngine().seed(38291);
   const int kTestsToRun = 100 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
     const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
@@ -121,6 +124,7 @@ TEST(ResizeBilinear, TestResizeBilinear) {
 }
 
 TEST(ResizeBilinear2x2, TestResizeBilinear) {
+  RandomEngine().seed(38291);
   const int kTestsToRun = 100 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
     const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
diff --git a/tensorflow/lite/kernels/internal/softmax_quantized_test.cc b/tensorflow/lite/kernels/internal/softmax_quantized_test.cc
index 743ce0355c96fd2766fd2315299c2419703f11b7..8ac62d9af787b2846a0f2031a3c9bcd9f2ab44d7 100644
--- a/tensorflow/lite/kernels/internal/softmax_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/softmax_quantized_test.cc
@@ -210,7 +210,7 @@ bool TryOneSkyscraperSoftmax(bool small_depth) {
 }
 
 TEST(TestQuantizedSoftmax, UniformSoftmaxTests) {
-  const int kTestsToRun = 1000;
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneUniformSoftmax()) {
     }
@@ -218,7 +218,7 @@ TEST(TestQuantizedSoftmax, UniformSoftmaxTests) {
 }
 
 TEST(TestQuantizedSoftmax, SkyscraperSoftmaxTests) {
-  const int kTestsToRun = 1000;
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneSkyscraperSoftmax(false)) {
     }
@@ -226,7 +226,7 @@ TEST(TestQuantizedSoftmax, SkyscraperSoftmaxTests) {
 }
 
 TEST(TestQuantizedSoftmax, SmallSkyscraperSoftmaxTests) {
-  const int kTestsToRun = 1000;
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneSkyscraperSoftmax(true)) {
     }
diff --git a/tensorflow/lite/kernels/layer_norm_lstm.cc b/tensorflow/lite/kernels/layer_norm_lstm.cc
index 49e8a53c829a0c4a8ae355f8e7a6b97e3bbb81e1..ce0c21dfcba770b72f144c272d7ab12b2e77e399 100644
--- a/tensorflow/lite/kernels/layer_norm_lstm.cc
+++ b/tensorflow/lite/kernels/layer_norm_lstm.cc
@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+// DEPRECATED: Tensorflow Lite has implemented layer norm lstm as builtin Op and
+// the implementation of layer norm lstm as custom Op in this file is
+// deprecated. It is only kept for backward compatibility.
+//
 // Layer Normalization LSTM op that applies normalization by mean and standard
 // deviation to the activation of the LSTM layers. Please see
 // https://arxiv.org/abs/1607.06450 for details.
diff --git a/tensorflow/lite/kernels/layer_norm_lstm_test.cc b/tensorflow/lite/kernels/layer_norm_lstm_test.cc
index 1c13cee1c3f66ed2a3459cd2bcc32211c3b1f00e..5aed818f2407a96acb8893654971fc5bb91a81ed 100644
--- a/tensorflow/lite/kernels/layer_norm_lstm_test.cc
+++ b/tensorflow/lite/kernels/layer_norm_lstm_test.cc
@@ -133,85 +133,87 @@ class LayerNormLSTMOpModel : public SingleOpModel {
     BuildInterpreter(input_shapes);
   }
 
-  void SetInputToInputWeights(std::vector<float> f) {
+  void SetInputToInputWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_input_weights_, f);
   }
 
-  void SetInputToForgetWeights(std::vector<float> f) {
+  void SetInputToForgetWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_forget_weights_, f);
   }
 
-  void SetInputToCellWeights(std::vector<float> f) {
+  void SetInputToCellWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_cell_weights_, f);
   }
 
-  void SetInputToOutputWeights(std::vector<float> f) {
+  void SetInputToOutputWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_output_weights_, f);
   }
 
-  void SetRecurrentToInputWeights(std::vector<float> f) {
+  void SetRecurrentToInputWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_input_weights_, f);
   }
 
-  void SetRecurrentToForgetWeights(std::vector<float> f) {
+  void SetRecurrentToForgetWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_forget_weights_, f);
   }
 
-  void SetRecurrentToCellWeights(std::vector<float> f) {
+  void SetRecurrentToCellWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_cell_weights_, f);
   }
 
-  void SetRecurrentToOutputWeights(std::vector<float> f) {
+  void SetRecurrentToOutputWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_output_weights_, f);
   }
 
-  void SetCellToInputWeights(std::vector<float> f) {
+  void SetCellToInputWeights(const std::vector<float>& f) {
     PopulateTensor(cell_to_input_weights_, f);
   }
 
-  void SetCellToForgetWeights(std::vector<float> f) {
+  void SetCellToForgetWeights(const std::vector<float>& f) {
     PopulateTensor(cell_to_forget_weights_, f);
   }
 
-  void SetCellToOutputWeights(std::vector<float> f) {
+  void SetCellToOutputWeights(const std::vector<float>& f) {
     PopulateTensor(cell_to_output_weights_, f);
   }
 
-  void SetInputLayerNormWeights(std::vector<float> f) {
+  void SetInputLayerNormWeights(const std::vector<float>& f) {
     PopulateTensor(input_layer_norm_weights_, f);
   }
 
-  void SetForgetLayerNormWeights(std::vector<float> f) {
+  void SetForgetLayerNormWeights(const std::vector<float>& f) {
     PopulateTensor(forget_layer_norm_weights_, f);
   }
 
-  void SetCellLayerNormWeights(std::vector<float> f) {
+  void SetCellLayerNormWeights(const std::vector<float>& f) {
     PopulateTensor(cell_layer_norm_weights_, f);
   }
 
-  void SetOutputLayerNormWeights(std::vector<float> f) {
+  void SetOutputLayerNormWeights(const std::vector<float>& f) {
     PopulateTensor(output_layer_norm_weights_, f);
   }
 
-  void SetInputGateBias(std::vector<float> f) {
+  void SetInputGateBias(const std::vector<float>& f) {
     PopulateTensor(input_gate_bias_, f);
   }
 
-  void SetForgetGateBias(std::vector<float> f) {
+  void SetForgetGateBias(const std::vector<float>& f) {
     PopulateTensor(forget_gate_bias_, f);
   }
 
-  void SetCellBias(std::vector<float> f) { PopulateTensor(cell_bias_, f); }
+  void SetCellBias(const std::vector<float>& f) {
+    PopulateTensor(cell_bias_, f);
+  }
 
-  void SetOutputGateBias(std::vector<float> f) {
+  void SetOutputGateBias(const std::vector<float>& f) {
     PopulateTensor(output_gate_bias_, f);
   }
 
-  void SetProjectionWeights(std::vector<float> f) {
+  void SetProjectionWeights(const std::vector<float>& f) {
     PopulateTensor(projection_weights_, f);
   }
 
-  void SetProjectionBias(std::vector<float> f) {
+  void SetProjectionBias(const std::vector<float>& f) {
     PopulateTensor(projection_bias_, f);
   }
 
@@ -280,67 +282,67 @@ class HybridLayerNormLSTMOpModel : public LayerNormLSTMOpModel {
                              use_projection_bias, cell_clip, proj_clip,
                              input_shapes, TensorType_UINT8) {}
 
-  void SetInputToInputWeights(std::vector<float> f) {
+  void SetInputToInputWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(input_to_input_weights_, f);
   }
 
-  void SetInputToForgetWeights(std::vector<float> f) {
+  void SetInputToForgetWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(input_to_forget_weights_, f);
   }
 
-  void SetInputToCellWeights(std::vector<float> f) {
+  void SetInputToCellWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(input_to_cell_weights_, f);
   }
 
-  void SetInputToOutputWeights(std::vector<float> f) {
+  void SetInputToOutputWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(input_to_output_weights_, f);
   }
 
-  void SetRecurrentToInputWeights(std::vector<float> f) {
+  void SetRecurrentToInputWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(recurrent_to_input_weights_, f);
   }
 
-  void SetRecurrentToForgetWeights(std::vector<float> f) {
+  void SetRecurrentToForgetWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(recurrent_to_forget_weights_, f);
   }
 
-  void SetRecurrentToCellWeights(std::vector<float> f) {
+  void SetRecurrentToCellWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(recurrent_to_cell_weights_, f);
   }
 
-  void SetRecurrentToOutputWeights(std::vector<float> f) {
+  void SetRecurrentToOutputWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(recurrent_to_output_weights_, f);
   }
 
-  void SetCellToInputWeights(std::vector<float> f) {
+  void SetCellToInputWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(cell_to_input_weights_, f);
   }
 
-  void SetCellToForgetWeights(std::vector<float> f) {
+  void SetCellToForgetWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(cell_to_forget_weights_, f);
   }
 
-  void SetCellToOutputWeights(std::vector<float> f) {
+  void SetCellToOutputWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(cell_to_output_weights_, f);
   }
 
-  void SetInputLayerNormWeights(std::vector<float> f) {
+  void SetInputLayerNormWeights(const std::vector<float>& f) {
     PopulateTensor(input_layer_norm_weights_, f);
   }
 
-  void SetForgetLayerNormWeights(std::vector<float> f) {
+  void SetForgetLayerNormWeights(const std::vector<float>& f) {
     PopulateTensor(forget_layer_norm_weights_, f);
   }
 
-  void SetCellLayerNormWeights(std::vector<float> f) {
+  void SetCellLayerNormWeights(const std::vector<float>& f) {
     PopulateTensor(cell_layer_norm_weights_, f);
   }
 
-  void SetOutputLayerNormWeights(std::vector<float> f) {
+  void SetOutputLayerNormWeights(const std::vector<float>& f) {
     PopulateTensor(output_layer_norm_weights_, f);
   }
 
-  void SetProjectionWeights(std::vector<float> f) {
+  void SetProjectionWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(projection_weights_, f);
   }
 };
diff --git a/tensorflow/lite/kernels/lstm_test.cc b/tensorflow/lite/kernels/lstm_test.cc
index 09ce440276cb09a9b45812fb197501f963166858..40ee94888136207eddcb38577377027c718a0a58 100644
--- a/tensorflow/lite/kernels/lstm_test.cc
+++ b/tensorflow/lite/kernels/lstm_test.cc
@@ -129,85 +129,87 @@ class LSTMOpModel : public SingleOpModel {
     BuildInterpreter(input_shapes);
   }
 
-  void SetInputToInputWeights(std::vector<float> f) {
+  void SetInputToInputWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_input_weights_, f);
   }
 
-  void SetInputToForgetWeights(std::vector<float> f) {
+  void SetInputToForgetWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_forget_weights_, f);
   }
 
-  void SetInputToCellWeights(std::vector<float> f) {
+  void SetInputToCellWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_cell_weights_, f);
   }
 
-  void SetInputToOutputWeights(std::vector<float> f) {
+  void SetInputToOutputWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_output_weights_, f);
   }
 
-  void SetRecurrentToInputWeights(std::vector<float> f) {
+  void SetRecurrentToInputWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_input_weights_, f);
   }
 
-  void SetRecurrentToForgetWeights(std::vector<float> f) {
+  void SetRecurrentToForgetWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_forget_weights_, f);
   }
 
-  void SetRecurrentToCellWeights(std::vector<float> f) {
+  void SetRecurrentToCellWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_cell_weights_, f);
   }
 
-  void SetRecurrentToOutputWeights(std::vector<float> f) {
+  void SetRecurrentToOutputWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_output_weights_, f);
   }
 
-  void SetCellToInputWeights(std::vector<float> f) {
+  void SetCellToInputWeights(const std::vector<float>& f) {
     PopulateTensor(cell_to_input_weights_, f);
   }
 
-  void SetCellToForgetWeights(std::vector<float> f) {
+  void SetCellToForgetWeights(const std::vector<float>& f) {
     PopulateTensor(cell_to_forget_weights_, f);
   }
 
-  void SetCellToOutputWeights(std::vector<float> f) {
+  void SetCellToOutputWeights(const std::vector<float>& f) {
     PopulateTensor(cell_to_output_weights_, f);
   }
 
-  void SetInputLayerNormCoefficients(std::vector<float> f) {
+  void SetInputLayerNormCoefficients(const std::vector<float>& f) {
     PopulateTensor(input_layer_norm_coefficients_, f);
   }
 
-  void SetForgetLayerNormCoefficients(std::vector<float> f) {
+  void SetForgetLayerNormCoefficients(const std::vector<float>& f) {
     PopulateTensor(forget_layer_norm_coefficients_, f);
   }
 
-  void SetCellLayerNormCoefficients(std::vector<float> f) {
+  void SetCellLayerNormCoefficients(const std::vector<float>& f) {
     PopulateTensor(cell_layer_norm_coefficients_, f);
   }
 
-  void SetOutputLayerNormCoefficients(std::vector<float> f) {
+  void SetOutputLayerNormCoefficients(const std::vector<float>& f) {
     PopulateTensor(output_layer_norm_coefficients_, f);
   }
 
-  void SetInputGateBias(std::vector<float> f) {
+  void SetInputGateBias(const std::vector<float>& f) {
     PopulateTensor(input_gate_bias_, f);
   }
 
-  void SetForgetGateBias(std::vector<float> f) {
+  void SetForgetGateBias(const std::vector<float>& f) {
     PopulateTensor(forget_gate_bias_, f);
   }
 
-  void SetCellBias(std::vector<float> f) { PopulateTensor(cell_bias_, f); }
+  void SetCellBias(const std::vector<float>& f) {
+    PopulateTensor(cell_bias_, f);
+  }
 
-  void SetOutputGateBias(std::vector<float> f) {
+  void SetOutputGateBias(const std::vector<float>& f) {
     PopulateTensor(output_gate_bias_, f);
   }
 
-  void SetProjectionWeights(std::vector<float> f) {
+  void SetProjectionWeights(const std::vector<float>& f) {
     PopulateTensor(projection_weights_, f);
   }
 
-  void SetProjectionBias(std::vector<float> f) {
+  void SetProjectionBias(const std::vector<float>& f) {
     PopulateTensor(projection_bias_, f);
   }
 
@@ -280,7 +282,7 @@ class HybridLSTMOpModel : public LSTMOpModel {
 
   TensorType tensor_type_;
 
-  void SetWeights(int weights_idx, std::vector<float> f) {
+  void SetWeights(int weights_idx, const std::vector<float>& f) {
     if (tensor_type_ == TensorType_UINT8) {
       SymmetricQuantizeAndPopulate(weights_idx, f);
     } else {
@@ -288,51 +290,51 @@ class HybridLSTMOpModel : public LSTMOpModel {
     }
   }
 
-  void SetInputToInputWeights(std::vector<float> f) {
+  void SetInputToInputWeights(const std::vector<float>& f) {
     SetWeights(input_to_input_weights_, f);
   }
 
-  void SetInputToForgetWeights(std::vector<float> f) {
+  void SetInputToForgetWeights(const std::vector<float>& f) {
     SetWeights(input_to_forget_weights_, f);
   }
 
-  void SetInputToCellWeights(std::vector<float> f) {
+  void SetInputToCellWeights(const std::vector<float>& f) {
     SetWeights(input_to_cell_weights_, f);
   }
 
-  void SetInputToOutputWeights(std::vector<float> f) {
+  void SetInputToOutputWeights(const std::vector<float>& f) {
     SetWeights(input_to_output_weights_, f);
   }
 
-  void SetRecurrentToInputWeights(std::vector<float> f) {
+  void SetRecurrentToInputWeights(const std::vector<float>& f) {
     SetWeights(recurrent_to_input_weights_, f);
   }
 
-  void SetRecurrentToForgetWeights(std::vector<float> f) {
+  void SetRecurrentToForgetWeights(const std::vector<float>& f) {
     SetWeights(recurrent_to_forget_weights_, f);
   }
 
-  void SetRecurrentToCellWeights(std::vector<float> f) {
+  void SetRecurrentToCellWeights(const std::vector<float>& f) {
     SetWeights(recurrent_to_cell_weights_, f);
   }
 
-  void SetRecurrentToOutputWeights(std::vector<float> f) {
+  void SetRecurrentToOutputWeights(const std::vector<float>& f) {
     SetWeights(recurrent_to_output_weights_, f);
   }
 
-  void SetCellToInputWeights(std::vector<float> f) {
+  void SetCellToInputWeights(const std::vector<float>& f) {
     SetWeights(cell_to_input_weights_, f);
   }
 
-  void SetCellToForgetWeights(std::vector<float> f) {
+  void SetCellToForgetWeights(const std::vector<float>& f) {
     SetWeights(cell_to_forget_weights_, f);
   }
 
-  void SetCellToOutputWeights(std::vector<float> f) {
+  void SetCellToOutputWeights(const std::vector<float>& f) {
     SetWeights(cell_to_output_weights_, f);
   }
 
-  void SetProjectionWeights(std::vector<float> f) {
+  void SetProjectionWeights(const std::vector<float>& f) {
     SetWeights(projection_weights_, f);
   }
 };
@@ -1658,7 +1660,7 @@ class HybridLayerNormLSTMOpModel : public LayerNormLSTMOpModel {
 
   TensorType tensor_type_;
 
-  void SetWeights(int weights_idx, std::vector<float> f) {
+  void SetWeights(int weights_idx, const std::vector<float>& f) {
     if (tensor_type_ == TensorType_UINT8) {
       SymmetricQuantizeAndPopulate(weights_idx, f);
     } else {
@@ -1666,67 +1668,67 @@ class HybridLayerNormLSTMOpModel : public LayerNormLSTMOpModel {
     }
   }
 
-  void SetInputToInputWeights(std::vector<float> f) {
+  void SetInputToInputWeights(const std::vector<float>& f) {
     SetWeights(input_to_input_weights_, f);
   }
 
-  void SetInputToForgetWeights(std::vector<float> f) {
+  void SetInputToForgetWeights(const std::vector<float>& f) {
     SetWeights(input_to_forget_weights_, f);
   }
 
-  void SetInputToCellWeights(std::vector<float> f) {
+  void SetInputToCellWeights(const std::vector<float>& f) {
     SetWeights(input_to_cell_weights_, f);
   }
 
-  void SetInputToOutputWeights(std::vector<float> f) {
+  void SetInputToOutputWeights(const std::vector<float>& f) {
     SetWeights(input_to_output_weights_, f);
   }
 
-  void SetRecurrentToInputWeights(std::vector<float> f) {
+  void SetRecurrentToInputWeights(const std::vector<float>& f) {
     SetWeights(recurrent_to_input_weights_, f);
   }
 
-  void SetRecurrentToForgetWeights(std::vector<float> f) {
+  void SetRecurrentToForgetWeights(const std::vector<float>& f) {
     SetWeights(recurrent_to_forget_weights_, f);
   }
 
-  void SetRecurrentToCellWeights(std::vector<float> f) {
+  void SetRecurrentToCellWeights(const std::vector<float>& f) {
     SetWeights(recurrent_to_cell_weights_, f);
   }
 
-  void SetRecurrentToOutputWeights(std::vector<float> f) {
+  void SetRecurrentToOutputWeights(const std::vector<float>& f) {
     SetWeights(recurrent_to_output_weights_, f);
   }
 
-  void SetCellToInputWeights(std::vector<float> f) {
+  void SetCellToInputWeights(const std::vector<float>& f) {
     SetWeights(cell_to_input_weights_, f);
   }
 
-  void SetCellToForgetWeights(std::vector<float> f) {
+  void SetCellToForgetWeights(const std::vector<float>& f) {
     SetWeights(cell_to_forget_weights_, f);
   }
 
-  void SetCellToOutputWeights(std::vector<float> f) {
+  void SetCellToOutputWeights(const std::vector<float>& f) {
     SetWeights(cell_to_output_weights_, f);
   }
 
-  void SetInputLayerNormCoefficients(std::vector<float> f) {
+  void SetInputLayerNormCoefficients(const std::vector<float>& f) {
     PopulateTensor(input_layer_norm_coefficients_, f);
   }
 
-  void SetForgetLayerNormCoefficients(std::vector<float> f) {
+  void SetForgetLayerNormCoefficients(const std::vector<float>& f) {
     PopulateTensor(forget_layer_norm_coefficients_, f);
   }
 
-  void SetCellLayerNormCoefficients(std::vector<float> f) {
+  void SetCellLayerNormCoefficients(const std::vector<float>& f) {
     PopulateTensor(cell_layer_norm_coefficients_, f);
   }
 
-  void SetOutputLayerNormCoefficients(std::vector<float> f) {
+  void SetOutputLayerNormCoefficients(const std::vector<float>& f) {
     PopulateTensor(output_layer_norm_coefficients_, f);
   }
 
-  void SetProjectionWeights(std::vector<float> f) {
+  void SetProjectionWeights(const std::vector<float>& f) {
     SetWeights(projection_weights_, f);
   }
 };
diff --git a/tensorflow/lite/kernels/pad_test.cc b/tensorflow/lite/kernels/pad_test.cc
index 415a285c707e6aa7a5a2029822cdf54d57692839..3caa4065dcbadd699ee9e61b8e97a42281d32309 100644
--- a/tensorflow/lite/kernels/pad_test.cc
+++ b/tensorflow/lite/kernels/pad_test.cc
@@ -175,6 +175,7 @@ class PadOpDynamicModel : public PadOpModel<float> {
   }
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(PadOpTest, TooManyDimensions) {
   EXPECT_DEATH(
       PadOpConstModel({TensorType_FLOAT32, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, {9, 2},
@@ -195,6 +196,7 @@ TEST(PadOpTest, InvalidPadValue) {
                       {0, 0, 1, -1, 2, -1, 0, 0}, {TensorType_FLOAT32}),
       "Pad value has to be greater than equal to 0.");
 }
+#endif
 
 TEST(PadOpTest, SimpleConstTest) {
   // Padding is represented as four 2-D lists representing above padding and
@@ -306,6 +308,7 @@ class QuantizedPadOpTest : public ::testing::Test {
   }
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST_F(QuantizedPadOpTest, ZeroNotInQuantizationRange) {
   // The test_util and actual quantization code currently ensure that the range
   // must include zero, but if that ever changes, this test will catch it.
@@ -314,6 +317,7 @@ TEST_F(QuantizedPadOpTest, ZeroNotInQuantizationRange) {
                                  {TensorType_UINT8, {}, 1.0, 2.0}),
                ".*Check failed: f_min <= 0.*");
 }
+#endif
 
 TEST_F(QuantizedPadOpTest, SimpleConstTest) {
   // Padding is represented as four 2-D lists representing above padding and
@@ -371,6 +375,7 @@ TEST_F(QuantizedPadOpTest, AdvancedDynamicTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
 }
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(PadV2OpTest, TooManyDimensions) {
   EXPECT_DEATH(PadV2OpConstModel<float>(
                    {TensorType_FLOAT32, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, {9, 2},
@@ -392,6 +397,7 @@ TEST(PadV2OpTest, InvalidPadValue) {
                                         {TensorType_FLOAT32}),
                "Pad value has to be greater than equal to 0.");
 }
+#endif
 
 TEST(PadV2OpTest, SimpleConstTest) {
   // Padding is represented as four 2-D lists representing above padding and
@@ -495,6 +501,7 @@ class QuantizedPadV2OpTest : public ::testing::Test {
   }
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST_F(QuantizedPadV2OpTest, ZeroNotInQuantizationRange) {
   // The test_util and actual quantization code currently ensure that the range
   // must include zero, but if that ever changes, this test will catch it.
@@ -504,6 +511,7 @@ TEST_F(QuantizedPadV2OpTest, ZeroNotInQuantizationRange) {
                                  {TensorType_UINT8, {}, 1.0, 2.0}),
       ".*Check failed: f_min <= 0.*");
 }
+#endif
 
 TEST_F(QuantizedPadV2OpTest, SimpleConstTest) {
   // Padding is represented as four 2-D lists representing above padding and
diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc
index 47296fa10517ac390d88fe1ad6317d2e4f94af51..f17f39fc2b8bf104028cc307d0dbf40cc893585d 100644
--- a/tensorflow/lite/kernels/register.cc
+++ b/tensorflow/lite/kernels/register.cc
@@ -129,6 +129,7 @@ TfLiteRegistration* Register_LEAKY_RELU();
 TfLiteRegistration* Register_SQUARED_DIFFERENCE();
 TfLiteRegistration* Register_FILL();
 TfLiteRegistration* Register_MIRROR_PAD();
+TfLiteRegistration* Register_UNIQUE();
 
 TfLiteStatus UnsupportedTensorFlowOp(TfLiteContext* context, TfLiteNode* node) {
   context->ReportError(
@@ -284,6 +285,7 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_SQUARED_DIFFERENCE, Register_SQUARED_DIFFERENCE());
   AddBuiltin(BuiltinOperator_FILL, Register_FILL());
   AddBuiltin(BuiltinOperator_MIRROR_PAD, Register_MIRROR_PAD());
+  AddBuiltin(BuiltinOperator_UNIQUE, Register_UNIQUE());
 
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
   // custom ops aren't always included by default.
diff --git a/tensorflow/lite/kernels/reshape_test.cc b/tensorflow/lite/kernels/reshape_test.cc
index 00bbbef57eccef67d043e85c02ebe80c3f9387ef..f98f3eb9aea70967fdc16120589b597c770cfced 100644
--- a/tensorflow/lite/kernels/reshape_test.cc
+++ b/tensorflow/lite/kernels/reshape_test.cc
@@ -123,6 +123,7 @@ class ReshapeOpModel : public SingleOpModel {
   int output_;
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST_P(ReshapeOpTest, MismatchedDimensions) {
   if (GetParam() == kAsTensor) {
     ReshapeOpModel<float> m({1, 2, 4, 1}, {2}, {2, 1}, GetParam());
@@ -133,12 +134,15 @@ TEST_P(ReshapeOpTest, MismatchedDimensions) {
                  "num_input_elements != num_output_elements");
   }
 }
+#endif
 
 TEST_P(ReshapeOpTest, TooManyDimensions) {
   if (GetParam() == kAsReshapeOption) {
+#ifdef GTEST_HAS_DEATH_TEST
     EXPECT_DEATH(ReshapeOpModel<float>({1, 1, 2, 1, 1, 1, 1, 1, 1}, {9},
                                        {1, 1, 1, 1, 1, 1, 1, 1, 2}, GetParam()),
                  "Found too many dimensions");
+#endif
   } else {
     ReshapeOpModel<float> m({1, 1, 2, 1, 1, 1, 1, 1, 1}, {9},
                             {1, 1, 1, 1, 1, 1, 1, 1, 2}, GetParam());
@@ -150,6 +154,7 @@ TEST_P(ReshapeOpTest, TooManyDimensions) {
   }
 }
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST_P(ReshapeOpTest, TooManySpecialDimensions) {
   if (GetParam() != kAsTensor) {
     EXPECT_DEATH(
@@ -160,6 +165,7 @@ TEST_P(ReshapeOpTest, TooManySpecialDimensions) {
     EXPECT_DEATH(m.Invoke(), "stretch_dim != -1");
   }
 }
+#endif
 
 // Create the model with a 2x2 shape. Processing still works because the new
 // shape ends up being hardcoded as a flat vector.
@@ -202,12 +208,16 @@ TEST_P(ReshapeOpTest, ScalarOutput) {
 // and output are scalars.
 TEST_P(ReshapeOpTest, LegacyScalarOutput) {
   if (GetParam() == kAsConstantTensor) {
+#ifdef GTEST_HAS_DEATH_TEST
     EXPECT_DEATH(ReshapeOpModel<float>({1}, {1}, {0}, GetParam()),
                  "num_input_elements != num_output_elements");
+#endif
   } else if (GetParam() == kAsTensor) {
+#ifdef GTEST_HAS_DEATH_TEST
     ReshapeOpModel<float> m({1}, {1}, {0}, GetParam());
     m.SetInput({3});
     EXPECT_DEATH(m.Invoke(), "num_input_elements != num_output_elements");
+#endif
   } else {
     ReshapeOpModel<float> m({1}, {1}, {0}, GetParam());
     m.SetInput({3});
diff --git a/tensorflow/lite/kernels/space_to_batch_nd_test.cc b/tensorflow/lite/kernels/space_to_batch_nd_test.cc
index 4d55ba56b71c5e0c44f0145981db56cbef6ec99a..c5d6e9a53062d97801b518f15305e2052f861e7c 100644
--- a/tensorflow/lite/kernels/space_to_batch_nd_test.cc
+++ b/tensorflow/lite/kernels/space_to_batch_nd_test.cc
@@ -106,12 +106,14 @@ class SpaceToBatchNDOpDynamicModel : public SpaceToBatchNDOpModel {
   }
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(SpaceToBatchNDOpTest, InvalidShapeTest) {
   EXPECT_DEATH(
       SpaceToBatchNDOpConstModel({TensorType_FLOAT32, {1, 3, 3, 1}}, {2, 2},
                                  {0, 0, 0, 0}, {TensorType_FLOAT32}),
       "Cannot allocate tensors");
 }
+#endif
 
 TEST(SpaceToBatchNDOpTest, SimpleConstTest) {
   SpaceToBatchNDOpConstModel m({TensorType_FLOAT32, {1, 4, 4, 1}}, {2, 2},
@@ -220,6 +222,7 @@ class QuantizedSpaceToBatchNDOpTest : public ::testing::Test {
   }
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST_F(QuantizedSpaceToBatchNDOpTest, ZeroNotInQuantizationRange) {
   // The test_util and actual quantization code currently ensure that the range
   // must include zero, but if that ever changes, this test will catch it.
@@ -228,6 +231,7 @@ TEST_F(QuantizedSpaceToBatchNDOpTest, ZeroNotInQuantizationRange) {
                    {0, 0, 1, 1, 1, 1, 0, 0}, {TensorType_UINT8, {}, 1.0, 2.0}),
                ".*Check failed: f_min <= 0.*");
 }
+#endif
 
 TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingConstTest) {
   SpaceToBatchNDOpConstModel m({TensorType_UINT8, {1, 5, 2, 1}, -1.0, 1.0},
diff --git a/tensorflow/lite/kernels/space_to_depth_test.cc b/tensorflow/lite/kernels/space_to_depth_test.cc
index 5744669b6d62af61a0b20e7723b78c72f6db952d..3fa8d86348ef899b9bd42c19f5b1510b4c4e33d3 100644
--- a/tensorflow/lite/kernels/space_to_depth_test.cc
+++ b/tensorflow/lite/kernels/space_to_depth_test.cc
@@ -50,10 +50,12 @@ class SpaceToDepthOpModel : public SingleOpModel {
   int output_;
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(SpaceToDepthOpModel, BadBlockSize) {
   EXPECT_DEATH(SpaceToDepthOpModel({TensorType_FLOAT32, {1, 2, 2, 1}}, 3),
                "Cannot allocate tensors");
 }
+#endif
 
 TEST(SpaceToDepthOpModel, Float32) {
   SpaceToDepthOpModel m({TensorType_FLOAT32, {1, 2, 2, 2}}, 2);
diff --git a/tensorflow/lite/kernels/sparse_output_fully_connected_test.cc b/tensorflow/lite/kernels/sparse_output_fully_connected_test.cc
index fb71cb7f7b321ac0a18641d113449b8422a61f9a..7d5fec192ce6b103c41f47ed60eb1283f72da45f 100644
--- a/tensorflow/lite/kernels/sparse_output_fully_connected_test.cc
+++ b/tensorflow/lite/kernels/sparse_output_fully_connected_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/kernels/internal/types.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/kernels/test_util.h"
 
@@ -61,7 +62,7 @@ class BaseSparseOutputFullyConnectedOpModel : public SingleOpModel {
     PopulateTensor(input_, data);
   }
 
-  void SetLookup(const std::vector<int32>& f) { PopulateTensor(lookup_, f); }
+  void SetLookup(const std::vector<int32_t>& f) { PopulateTensor(lookup_, f); }
 
   void SetBias(const std::vector<float>& f) { PopulateTensor(bias_, f); }
 
diff --git a/tensorflow/lite/kernels/strided_slice_test.cc b/tensorflow/lite/kernels/strided_slice_test.cc
index 122e01b99ecbed1255ea4b2d29e82b57f04be80c..34875bf0497a000da02f3d0212b042399046a492 100644
--- a/tensorflow/lite/kernels/strided_slice_test.cc
+++ b/tensorflow/lite/kernels/strided_slice_test.cc
@@ -72,6 +72,7 @@ class StridedSliceOpModel : public SingleOpModel {
   int output_;
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(StridedSliceOpTest, UnsupportedInputSize) {
   EXPECT_DEATH(
       StridedSliceOpModel<>({2, 2, 2, 2, 2}, {5}, {5}, {5}, 0, 0, 0, 0, 0),
@@ -84,6 +85,7 @@ TEST(StridedSliceOpTest, UnssupportedArgs) {
   EXPECT_DEATH(StridedSliceOpModel<>({3, 2}, {2}, {2}, {2}, 0, 0, 0, 1, 0),
                "new_axis_mask is not implemented yet.");
 }
+#endif
 
 TEST(StridedSliceOpTest, In1D) {
   StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
diff --git a/tensorflow/lite/kernels/svdf_test.cc b/tensorflow/lite/kernels/svdf_test.cc
index fc2bb492236753776eae79009ec9484e26690ed5..c420260bf51bd45944a7b77a81e20e56999c8fbb 100644
--- a/tensorflow/lite/kernels/svdf_test.cc
+++ b/tensorflow/lite/kernels/svdf_test.cc
@@ -209,7 +209,7 @@ class HybridSVDFOpModel : public BaseSVDFOpModel {
     tensor_type_ = tensor_type;
   }
 
-  void SetWeights(int weights_idx, std::vector<float> f) {
+  void SetWeights(int weights_idx, const std::vector<float>& f) {
     if (tensor_type_ == TensorType_UINT8) {
       SymmetricQuantizeAndPopulate(weights_idx, f);
     } else {
diff --git a/tensorflow/lite/kernels/transpose_test.cc b/tensorflow/lite/kernels/transpose_test.cc
index 3ebaf3ca27ffd285ef86a81b2e63409fde565ef1..93df2c81db8c17de7a36d155c7d26b826c859c99 100644
--- a/tensorflow/lite/kernels/transpose_test.cc
+++ b/tensorflow/lite/kernels/transpose_test.cc
@@ -184,6 +184,7 @@ class TransposeOpDynamicModel : public TransposeOpModel {
   }
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(TransposeTest, TestUnequalPermSize) {
   EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {2}, {2, 2}), "2 != 4");
 }
@@ -194,6 +195,7 @@ TEST(TransposeTest, TestPermOutOfBounds) {
   EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {4}, {0, 1, 2, 4}),
                "Transpose op permutations array is out of bounds.");
 }
+#endif
 
 TEST(TransposeTest, Test1DInputConstTensor) {
   TransposeOpConstModel m({3}, {1}, {0});
@@ -252,10 +254,12 @@ TEST(TransposeTest, Test3DInputDynamicTensor) {
                                 2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23}));
 }
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(TransposeTest, Test5DInputTensor) {
   EXPECT_DEATH(TransposeOpConstModel({1, 2, 3, 4, 5}, {5}, {0, 1, 2, 3, 4}),
                "Transpose op only supports 1D-4D input arrays.");
 }
+#endif
 
 TEST(TransposeTest, SimpleTestNoReorderConstTensor) {
   TransposeOpConstModel m({1, 2, 3, 1}, {4}, {0, 1, 2, 3});
diff --git a/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc b/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc
index 274f907076f05ffbef03899dadc47bd6dd8ecc20..bc35d90773b522d22e4373c60ca83121ff7fd09e 100644
--- a/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc
+++ b/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc
@@ -252,7 +252,7 @@ class HybridUnidirectionalLSTMOpModel : public UnidirectionalLSTMOpModel {
     tensor_type_ = tensor_type;
   }
 
-  void SetWeights(int weights_idx, std::vector<float> f) {
+  void SetWeights(int weights_idx, const std::vector<float>& f) {
     if (tensor_type_ == TensorType_UINT8) {
       SymmetricQuantizeAndPopulate(weights_idx, f);
     } else {
diff --git a/tensorflow/lite/kernels/unidirectional_sequence_rnn_test.cc b/tensorflow/lite/kernels/unidirectional_sequence_rnn_test.cc
index 494c4495899672979933ffc7108d4ce6e8163610..de1f7818bd0f2a1420b6f277c08670f7e70fef27 100644
--- a/tensorflow/lite/kernels/unidirectional_sequence_rnn_test.cc
+++ b/tensorflow/lite/kernels/unidirectional_sequence_rnn_test.cc
@@ -255,7 +255,7 @@ class HybridUnidirectionalRNNOpModel : public UnidirectionalRNNOpModel {
     tensor_type_ = tensor_type;
   }
 
-  void SetWeights(int weights_idx, std::vector<float> f) {
+  void SetWeights(int weights_idx, const std::vector<float>& f) {
     if (tensor_type_ == TensorType_UINT8) {
       SymmetricQuantizeAndPopulate(weights_idx, f);
     } else {
diff --git a/tensorflow/lite/kernels/unique.cc b/tensorflow/lite/kernels/unique.cc
new file mode 100644
index 0000000000000000000000000000000000000000..80c033aa5ce1f0fb302f7b2f06d3e2cae69b9062
--- /dev/null
+++ b/tensorflow/lite/kernels/unique.cc
@@ -0,0 +1,164 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <map>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace unique {
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  return nullptr;
+}
+
+void Free(TfLiteContext* context, void* buffer) {}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  static const int kOutputUniqueTensor = 0;
+  static const int kOutputIndexTensor = 1;
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 2);
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output_unique_tensor =
+      GetOutput(context, node, kOutputUniqueTensor);
+  TfLiteTensor* output_index_tensor =
+      GetOutput(context, node, kOutputIndexTensor);
+
+  // The op only supports 1D input.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 1);
+  TfLiteIntArray* output_index_shape = TfLiteIntArrayCopy(input->dims);
+  // The unique values are determined during evaluation, so we don't know yet
+  // the size of the output tensor.
+  SetTensorToDynamic(output_unique_tensor);
+  return context->ResizeTensor(context, output_index_tensor,
+                               output_index_shape);
+}
+
+namespace {
+
+// Actual evaluation for the unique op.
+template <typename T, typename I>
+TfLiteStatus EvalImpl(TfLiteContext* context, const TfLiteTensor* input,
+                      TfLiteNode* node) {
+  // Map from value, to index in the unique elements vector.
+  // Note that we prefer to use map than unordered_map as it showed less
+  // increase in the binary size.
+  std::map<T, int> unique_values;
+  TfLiteTensor* output_indexes = GetOutput(context, node, 1);
+  I* indexes = GetTensorData<I>(output_indexes);
+  const T* data = GetTensorData<T>(input);
+  const int num_elements = NumElements(input);
+
+  for (int i = 0; i < num_elements; ++i) {
+    const auto element_it = unique_values.find(data[i]);
+    if (element_it != unique_values.end()) {
+      indexes[i] = element_it->second;
+    } else {
+      const int unique_index = unique_values.size();
+      unique_values[data[i]] = unique_index;
+      indexes[i] = unique_index;
+    }
+  }
+  // Allocate output tensor.
+  TfLiteTensor* unique_output = GetOutput(context, node, 0);
+  std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
+      TfLiteIntArrayCreate(NumDimensions(input)), TfLiteIntArrayFree);
+  shape->data[0] = unique_values.size();
+  TF_LITE_ENSURE_STATUS(
+      context->ResizeTensor(context, unique_output, shape.release()));
+  // Set the values in the output tensor.
+  T* output_unique_values = GetTensorData<T>(unique_output);
+  for (int i = 0; i < unique_values.size(); ++i) {
+    output_unique_values[i] = data[indexes[i]];
+  }
+  return kTfLiteOk;
+}
+
+template <typename T>
+TfLiteStatus EvalImpl(TfLiteContext* context, const TfLiteTensor* input,
+                      TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteUniqueParams*>(node->builtin_data);
+  if (params == nullptr) {
+    context->ReportError(context, "Null params passed");
+    return kTfLiteError;
+  }
+  switch (params->index_out_type) {
+    case kTfLiteInt32:
+      return EvalImpl<T, int32_t>(context, input, node);
+    case kTfLiteInt64:
+      return EvalImpl<T, int64_t>(context, input, node);
+    default:
+      context->ReportError(
+          context,
+          "Unique index output array can only be Int32 or In64, requested: ",
+          TfLiteTypeGetName(params->index_out_type));
+  }
+  return kTfLiteError;
+}
+
+}  // namespace
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output_index_tensor = GetOutput(context, node, 1);
+  TF_LITE_ENSURE_EQ(context, NumElements(output_index_tensor),
+                    NumElements(input));
+
+  switch (input->type) {
+    case kTfLiteInt8:
+      TF_LITE_ENSURE_STATUS(EvalImpl<int8_t>(context, input, node));
+      break;
+    case kTfLiteInt16:
+      TF_LITE_ENSURE_STATUS(EvalImpl<int16_t>(context, input, node));
+      break;
+    case kTfLiteInt32:
+      TF_LITE_ENSURE_STATUS(EvalImpl<int32_t>(context, input, node));
+      break;
+    case kTfLiteInt64:
+      TF_LITE_ENSURE_STATUS(EvalImpl<int64_t>(context, input, node));
+      break;
+    case kTfLiteFloat32:
+      TF_LITE_ENSURE_STATUS(EvalImpl<float>(context, input, node));
+      break;
+    case kTfLiteUInt8:
+      TF_LITE_ENSURE_STATUS(EvalImpl<uint8_t>(context, input, node));
+      break;
+    default:
+      context->ReportError(context, "Currently Unique doesn't support type: %s",
+                           TfLiteTypeGetName(input->type));
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace unique
+
+TfLiteRegistration* Register_UNIQUE() {
+  static TfLiteRegistration r = {unique::Init, unique::Free, unique::Prepare,
+                                 unique::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/unique_test.cc b/tensorflow/lite/kernels/unique_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1df5e6b7967ea701c573e6d1f9abc04f0067b65a
--- /dev/null
+++ b/tensorflow/lite/kernels/unique_test.cc
@@ -0,0 +1,103 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+template <typename T, typename I>
+class UniqueOpModel : public SingleOpModel {
+ public:
+  UniqueOpModel(const TensorData& input, TensorType input_type,
+                TensorType index_out_type) {
+    input_id_ = AddInput(input);
+    output_id_ = AddOutput(input_type);
+    output_index_id_ = AddOutput(index_out_type);
+    SetBuiltinOp(BuiltinOperator_UNIQUE, BuiltinOptions_UniqueOptions,
+                 CreateUniqueOptions(builder_, index_out_type).Union());
+    BuildInterpreter({GetShape(input_id_)});
+  }
+
+  int input_tensor_id() { return input_id_; }
+
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_id_); }
+  std::vector<I> GetIndexesOutput() {
+    return ExtractVector<I>(output_index_id_);
+  }
+
+ protected:
+  int input_id_;
+  int output_id_;
+  int output_index_id_;
+};
+
+TEST(UniqueOpModelTest, OneElement) {
+  UniqueOpModel<float, int32_t> model({TensorType_FLOAT32, {1}},
+                                      TensorType_FLOAT32, TensorType_INT32);
+  model.PopulateTensor<float>(model.input_tensor_id(), {5});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({5}));
+  EXPECT_THAT(model.GetIndexesOutput(), ElementsAreArray({0}));
+}
+
+TEST(UniqueOpModelTest, MultipleElements_AllUnique) {
+  UniqueOpModel<float, int32_t> model({TensorType_FLOAT32, {8}},
+                                      TensorType_FLOAT32, TensorType_INT32);
+  model.PopulateTensor<float>(model.input_tensor_id(),
+                              {5, 2, 3, 51, 6, 72, 7, 8});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({5, 2, 3, 51, 6, 72, 7, 8}));
+  EXPECT_THAT(model.GetIndexesOutput(),
+              ElementsAreArray({0, 1, 2, 3, 4, 5, 6, 7}));
+}
+
+TEST(UniqueOpModelTest, MultipleElements_AllDuplicates) {
+  UniqueOpModel<float, int32_t> model({TensorType_FLOAT32, {7}},
+                                      TensorType_FLOAT32, TensorType_INT32);
+  model.PopulateTensor<float>(model.input_tensor_id(), {5, 5, 5, 5, 5, 5, 5});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({5}));
+  EXPECT_THAT(model.GetIndexesOutput(),
+              ElementsAreArray({0, 0, 0, 0, 0, 0, 0}));
+}
+
+TEST(UniqueOpModelTest, MultipleElements_SomeDuplicates) {
+  UniqueOpModel<float, int32_t> model({TensorType_FLOAT32, {7}},
+                                      TensorType_FLOAT32, TensorType_INT32);
+  model.PopulateTensor<float>(model.input_tensor_id(), {2, 3, 5, 7, 2, 7, 3});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({2, 3, 5, 7}));
+  EXPECT_THAT(model.GetIndexesOutput(),
+              ElementsAreArray({0, 1, 2, 3, 0, 3, 1}));
+}
+
+TEST(UniqueOpModelTest, MultipleElements_SomeDuplicates_IndexInt64) {
+  UniqueOpModel<float, int64_t> model({TensorType_FLOAT32, {7}},
+                                      TensorType_FLOAT32, TensorType_INT64);
+  model.PopulateTensor<float>(model.input_tensor_id(), {2, 3, 5, 7, 2, 7, 3});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({2, 3, 5, 7}));
+  EXPECT_THAT(model.GetIndexesOutput(),
+              ElementsAreArray({0, 1, 2, 3, 0, 3, 1}));
+}
+
+}  // namespace
+}  // namespace tflite
diff --git a/tensorflow/lite/models/smartreply/BUILD b/tensorflow/lite/models/smartreply/BUILD
index a3f4e61cb56e20eeedd5de31a79ddd3c25b601ba..fccbf79eb9139c794b7d259f2614dd0212061a14 100644
--- a/tensorflow/lite/models/smartreply/BUILD
+++ b/tensorflow/lite/models/smartreply/BUILD
@@ -58,7 +58,6 @@ tf_cc_test(
         "//tensorflow/lite/models:testdata/smartreply_samples.tsv",
         "@tflite_smartreply//:smartreply.tflite",
     ],
-    tags = ["no_oss"],
     deps = [
         ":predictor_lib",
         "//tensorflow/core:test",
diff --git a/tensorflow/lite/models/smartreply/predictor_test.cc b/tensorflow/lite/models/smartreply/predictor_test.cc
index 9bdd7b537a3aa1b2c659d90454dbaea1a195b979..9ce8ce9a77ddd94efa70f00dc5c88856256c478a 100644
--- a/tensorflow/lite/models/smartreply/predictor_test.cc
+++ b/tensorflow/lite/models/smartreply/predictor_test.cc
@@ -31,12 +31,16 @@ namespace custom {
 namespace smartreply {
 namespace {
 
-const char kModelName[] = "smartreply_ondevice_model.bin";
 const char kSamples[] = "smartreply_samples.tsv";
 
-string TestDataPath() {
+string GetModelFilePath() {
+  return "third_party/tensorflow/lite/models/testdata/"
+         "smartreply_ondevice_model.bin";
+}
+
+string GetSamplesFilePath() {
   return string(absl::StrCat(tensorflow::testing::TensorFlowSrcRoot(), "/",
-                             "lite/models/testdata/"));
+                             "lite/models/testdata/", kSamples));
 }
 
 MATCHER_P(IncludeAnyResponesIn, expected_response, "contains the response") {
@@ -57,8 +61,7 @@ class PredictorTest : public ::testing::Test {
   ~PredictorTest() override {}
 
   void SetUp() override {
-    model_ = tflite::FlatBufferModel::BuildFromFile(
-        absl::StrCat(TestDataPath(), "/", kModelName).c_str());
+    model_ = tflite::FlatBufferModel::BuildFromFile(GetModelFilePath().c_str());
     ASSERT_NE(model_.get(), nullptr);
   }
 
@@ -123,7 +126,7 @@ TEST_F(PredictorTest, BatchTest) {
   int total_triggers = 0;
 
   string line;
-  std::ifstream fin(absl::StrCat(TestDataPath(), "/", kSamples));
+  std::ifstream fin(GetSamplesFilePath());
   while (std::getline(fin, line)) {
     const std::vector<string> fields = absl::StrSplit(line, '\t');
     if (fields.empty()) {
diff --git a/tensorflow/lite/nnapi_delegate.cc b/tensorflow/lite/nnapi_delegate.cc
index 26d75696a1c889d752f9715358701da6300f49df..dc8e81cde758f6d187046d865d42141200f753bc 100644
--- a/tensorflow/lite/nnapi_delegate.cc
+++ b/tensorflow/lite/nnapi_delegate.cc
@@ -686,6 +686,7 @@ TfLiteStatus AddOpsAndParams(
       case tflite::BuiltinOperator_MIRROR_PAD:
       case tflite::BuiltinOperator_ABS:
       case tflite::BuiltinOperator_SPLIT_V:
+      case tflite::BuiltinOperator_UNIQUE:
         logError("Op code %d is currently not delegated to NNAPI", builtin);
         return kTfLiteError;
         break;
diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD
index e666812bd23dd9a17cbfbb0ac8f74d6bb882c33d..54b925cc0514036b582901a0d0c15cc988ddf4a6 100644
--- a/tensorflow/lite/python/BUILD
+++ b/tensorflow/lite/python/BUILD
@@ -74,6 +74,7 @@ py_test(
     data = ["@tflite_mobilenet_ssd_quant_protobuf//:tflite_graph.pb"],
     srcs_version = "PY2AND3",
     tags = [
+        "no_oss",
         "no_windows",
     ],
     deps = [
@@ -115,8 +116,6 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/framework:framework_py",
-        "//tensorflow/contrib/graph_editor:graph_editor_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework",
         "//tensorflow/python:platform",
diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py
index 9f315fc8747b30f39ae844230886d55b7d9d7864..9c603998717019ac8624868b16d720e300a30efd 100644
--- a/tensorflow/lite/python/convert.py
+++ b/tensorflow/lite/python/convert.py
@@ -37,14 +37,11 @@ from tensorflow.python.util.tf_export import tf_export as _tf_export
 
 # Lazy load since some of the performance benchmark skylark rules
 # break dependencies.
-if lite_constants.EXPERIMENTAL_USE_TOCO_API_DIRECTLY:
-  _toco_python = LazyLoader(
-      "tensorflow_wrap_toco", globals(),
-      "tensorflow.lite.toco.python."
-      "tensorflow_wrap_toco")
-  del LazyLoader
-else:
-  _toco_python = None
+_toco_python = LazyLoader(
+    "tensorflow_wrap_toco", globals(),
+    "tensorflow.lite.toco.python."
+    "tensorflow_wrap_toco")
+del LazyLoader
 
 # Find the toco_from_protos binary using the resource loader if using from
 # bazel, otherwise we are in a pip where console_scripts already has
diff --git a/tensorflow/lite/python/convert_test.py b/tensorflow/lite/python/convert_test.py
index cf49ee2b472d2c6617811cde0978eb8ae3a16f8e..a29d431322f37bcc2b0526b977060545e0c3d70c 100644
--- a/tensorflow/lite/python/convert_test.py
+++ b/tensorflow/lite/python/convert_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.framework.graph_util_impl import _bfs_for_reachable_nodes
 from tensorflow.python.framework.graph_util_impl import _extract_graph_summary
+from tensorflow.python.framework.graph_util_impl import _node_name
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -389,6 +390,28 @@ class ConvertTestOpHint(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError):
       convert.convert_dtype_to_tflite_type(dtypes.bool)
 
+  def testFindHintedOutputNodes(self):
+    """Test if all hinted output nodes are correctly found."""
+
+    def _build_ophinted_op(name, input1, input2):
+      custom_op = op_hint.OpHint(name)
+      input1 = custom_op.add_input(input1)
+      input2 = custom_op.add_input(input2)
+      output = math_ops.mul(input1, input2)
+      return custom_op.add_output(output)
+
+    output_1 = _build_ophinted_op("custom_op_1", array_ops.constant([1.]),
+                                  array_ops.constant([2.]))
+    output_2 = _build_ophinted_op("custom_op_2", array_ops.constant([3.]),
+                                  array_ops.constant([4.]))
+    with self.cached_session() as sess:
+      hinted_outputs_nodes = op_hint.find_all_hinted_output_nodes(sess)
+      expected_hinted_output_nodes = [
+          _node_name(output_1.name),
+          _node_name(output_2.name)
+      ]
+      self.assertCountEqual(hinted_outputs_nodes, expected_hinted_output_nodes)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/lite/python/op_hint.py b/tensorflow/lite/python/op_hint.py
index 8d7f9316bfe81255510fc5aca9ffdf9671cd64df..6ec050171fc39308c36ec8f43af639f59f4f387c 100644
--- a/tensorflow/lite/python/op_hint.py
+++ b/tensorflow/lite/python/op_hint.py
@@ -964,6 +964,35 @@ def _convert_op_hints_to_stubs_helper(
   return curr_graph_def
 
 
+def find_all_hinted_output_nodes(session=None, graph_def=None):
+  """Find all Ophints output nodes in the graph.
+
+  This is used to get all the output nodes those are ophinted, it is important
+  for operation like convert_variables_to_constants keep all ophints structure.
+  Note: only one of session or graph_def should be used, not both.
+
+  Args:
+    session: A TensorFlow session that contains the graph to convert.
+    graph_def: A graph def that we should convert.
+
+  Returns:
+    A list of OpHints output nodes.
+  Raises:
+    ValueError: If both session and graph_def are provided.
+  """
+  if session is not None and graph_def is not None:
+    raise ValueError("Provide only one of session and graph_def.")
+  hinted_outputs_nodes = []
+  if session is not None:
+    hints = _find_all_hints_in_graph_def(session.graph_def)
+  elif graph_def is not None:
+    hints = _find_all_hints_in_graph_def(graph_def)
+  for hint in _six.itervalues(hints):
+    _, ouput_nodes = hint.flattened_inputs_and_outputs()
+    hinted_outputs_nodes.extend(ouput_nodes)
+  return hinted_outputs_nodes
+
+
 def convert_op_hints_to_stubs(session=None,
                               graph_def=None,
                               write_callback=lambda graph_def, comments: None):
@@ -996,6 +1025,7 @@ def convert_op_hints_to_stubs(session=None,
 
 
 _allowed_symbols = [
-    "OpHint", "convert_op_hints_to_stubs", "convert_op_hints_to_stubs_new"
+    "OpHint", "convert_op_hints_to_stubs", "convert_op_hints_to_stubs_new",
+    "find_all_hinted_output_nodes"
 ]
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/lite/python/tflite_convert.py b/tensorflow/lite/python/tflite_convert.py
index 341b539bead296ca28c1f5f8c17928e553ebabc4..401a592273c9c76f1f371bb8972f7f9a3d494278 100644
--- a/tensorflow/lite/python/tflite_convert.py
+++ b/tensorflow/lite/python/tflite_convert.py
@@ -343,13 +343,13 @@ def run_main(_):
             "floats. Used for quantized input tensors. (default None)"))
   parser.add_argument(
       "--default_ranges_min",
-      type=int,
+      type=float,
       help=("Default value for min bound of min/max range values used for all "
             "arrays without a specified range, Intended for experimenting with "
             "quantization via \"dummy quantization\". (default None)"))
   parser.add_argument(
       "--default_ranges_max",
-      type=int,
+      type=float,
       help=("Default value for max bound of min/max range values used for all "
             "arrays without a specified range, Intended for experimenting with "
             "quantization via \"dummy quantization\". (default None)"))
diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs
index 91d8049301b235624d924c023eb1dd29c5e86689..aca926a3b9753c8649e225bbb7bf7e6201fb2a2d 100644
--- a/tensorflow/lite/schema/schema.fbs
+++ b/tensorflow/lite/schema/schema.fbs
@@ -205,6 +205,7 @@ enum BuiltinOperator : byte {
   MIRROR_PAD = 100,
   ABS = 101,
   SPLIT_V = 102,
+  UNIQUE = 103,
 }
 
 // Options for the builtin operators.
@@ -288,6 +289,7 @@ union BuiltinOptions {
   MirrorPadOptions,
   AbsOptions,
   SplitVOptions,
+  UniqueOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -701,6 +703,10 @@ table MirrorPadOptions {
   mode:MirrorPadMode;
 }
 
+table UniqueOptions {
+  idx_out_type:TensorType = INT32;
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
diff --git a/tensorflow/lite/schema/schema_generated.h b/tensorflow/lite/schema/schema_generated.h
index 0883cce497d663c6f5eed768564d7a8624f7295e..1d153cc71778a10f59645ed948f4d61a2ae03b0e 100755
--- a/tensorflow/lite/schema/schema_generated.h
+++ b/tensorflow/lite/schema/schema_generated.h
@@ -268,6 +268,9 @@ struct SquaredDifferenceOptionsT;
 struct MirrorPadOptions;
 struct MirrorPadOptionsT;
 
+struct UniqueOptions;
+struct UniqueOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -520,11 +523,12 @@ enum BuiltinOperator {
   BuiltinOperator_MIRROR_PAD = 100,
   BuiltinOperator_ABS = 101,
   BuiltinOperator_SPLIT_V = 102,
+  BuiltinOperator_UNIQUE = 103,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_SPLIT_V
+  BuiltinOperator_MAX = BuiltinOperator_UNIQUE
 };
 
-inline const BuiltinOperator (&EnumValuesBuiltinOperator())[102] {
+inline const BuiltinOperator (&EnumValuesBuiltinOperator())[103] {
   static const BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -627,7 +631,8 @@ inline const BuiltinOperator (&EnumValuesBuiltinOperator())[102] {
     BuiltinOperator_SQUARED_DIFFERENCE,
     BuiltinOperator_MIRROR_PAD,
     BuiltinOperator_ABS,
-    BuiltinOperator_SPLIT_V
+    BuiltinOperator_SPLIT_V,
+    BuiltinOperator_UNIQUE
   };
   return values;
 }
@@ -737,6 +742,7 @@ inline const char * const *EnumNamesBuiltinOperator() {
     "MIRROR_PAD",
     "ABS",
     "SPLIT_V",
+    "UNIQUE",
     nullptr
   };
   return names;
@@ -828,11 +834,12 @@ enum BuiltinOptions {
   BuiltinOptions_MirrorPadOptions = 77,
   BuiltinOptions_AbsOptions = 78,
   BuiltinOptions_SplitVOptions = 79,
+  BuiltinOptions_UniqueOptions = 80,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_SplitVOptions
+  BuiltinOptions_MAX = BuiltinOptions_UniqueOptions
 };
 
-inline const BuiltinOptions (&EnumValuesBuiltinOptions())[80] {
+inline const BuiltinOptions (&EnumValuesBuiltinOptions())[81] {
   static const BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -913,7 +920,8 @@ inline const BuiltinOptions (&EnumValuesBuiltinOptions())[80] {
     BuiltinOptions_SquaredDifferenceOptions,
     BuiltinOptions_MirrorPadOptions,
     BuiltinOptions_AbsOptions,
-    BuiltinOptions_SplitVOptions
+    BuiltinOptions_SplitVOptions,
+    BuiltinOptions_UniqueOptions
   };
   return values;
 }
@@ -1000,6 +1008,7 @@ inline const char * const *EnumNamesBuiltinOptions() {
     "MirrorPadOptions",
     "AbsOptions",
     "SplitVOptions",
+    "UniqueOptions",
     nullptr
   };
   return names;
@@ -1330,6 +1339,10 @@ template<> struct BuiltinOptionsTraits<SplitVOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_SplitVOptions;
 };
 
+template<> struct BuiltinOptionsTraits<UniqueOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_UniqueOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -1993,6 +2006,14 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_SplitVOptions ?
       reinterpret_cast<const SplitVOptionsT *>(value) : nullptr;
   }
+  UniqueOptionsT *AsUniqueOptions() {
+    return type == BuiltinOptions_UniqueOptions ?
+      reinterpret_cast<UniqueOptionsT *>(value) : nullptr;
+  }
+  const UniqueOptionsT *AsUniqueOptions() const {
+    return type == BuiltinOptions_UniqueOptions ?
+      reinterpret_cast<const UniqueOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -7021,6 +7042,60 @@ inline flatbuffers::Offset<MirrorPadOptions> CreateMirrorPadOptions(
 
 flatbuffers::Offset<MirrorPadOptions> CreateMirrorPadOptions(flatbuffers::FlatBufferBuilder &_fbb, const MirrorPadOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct UniqueOptionsT : public flatbuffers::NativeTable {
+  typedef UniqueOptions TableType;
+  TensorType idx_out_type;
+  UniqueOptionsT()
+      : idx_out_type(TensorType_INT32) {
+  }
+};
+
+struct UniqueOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef UniqueOptionsT NativeTableType;
+  enum {
+    VT_IDX_OUT_TYPE = 4
+  };
+  TensorType idx_out_type() const {
+    return static_cast<TensorType>(GetField<int8_t>(VT_IDX_OUT_TYPE, 2));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_IDX_OUT_TYPE) &&
+           verifier.EndTable();
+  }
+  UniqueOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(UniqueOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<UniqueOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const UniqueOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct UniqueOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_idx_out_type(TensorType idx_out_type) {
+    fbb_.AddElement<int8_t>(UniqueOptions::VT_IDX_OUT_TYPE, static_cast<int8_t>(idx_out_type), 2);
+  }
+  explicit UniqueOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  UniqueOptionsBuilder &operator=(const UniqueOptionsBuilder &);
+  flatbuffers::Offset<UniqueOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<UniqueOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<UniqueOptions> CreateUniqueOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    TensorType idx_out_type = TensorType_INT32) {
+  UniqueOptionsBuilder builder_(_fbb);
+  builder_.add_idx_out_type(idx_out_type);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<UniqueOptions> CreateUniqueOptions(flatbuffers::FlatBufferBuilder &_fbb, const UniqueOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
@@ -7391,6 +7466,9 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const SplitVOptions *builtin_options_as_SplitVOptions() const {
     return builtin_options_type() == BuiltinOptions_SplitVOptions ? static_cast<const SplitVOptions *>(builtin_options()) : nullptr;
   }
+  const UniqueOptions *builtin_options_as_UniqueOptions() const {
+    return builtin_options_type() == BuiltinOptions_UniqueOptions ? static_cast<const UniqueOptions *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -7738,6 +7816,10 @@ template<> inline const SplitVOptions *Operator::builtin_options_as<SplitVOption
   return builtin_options_as_SplitVOptions();
 }
 
+template<> inline const UniqueOptions *Operator::builtin_options_as<UniqueOptions>() const {
+  return builtin_options_as_UniqueOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -10356,6 +10438,32 @@ inline flatbuffers::Offset<MirrorPadOptions> CreateMirrorPadOptions(flatbuffers:
       _mode);
 }
 
+inline UniqueOptionsT *UniqueOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new UniqueOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void UniqueOptions::UnPackTo(UniqueOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = idx_out_type(); _o->idx_out_type = _e; };
+}
+
+inline flatbuffers::Offset<UniqueOptions> UniqueOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const UniqueOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateUniqueOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<UniqueOptions> CreateUniqueOptions(flatbuffers::FlatBufferBuilder &_fbb, const UniqueOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const UniqueOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _idx_out_type = _o->idx_out_type;
+  return tflite::CreateUniqueOptions(
+      _fbb,
+      _idx_out_type);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
@@ -10930,6 +11038,10 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const SplitVOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_UniqueOptions: {
+      auto ptr = reinterpret_cast<const UniqueOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -11264,6 +11376,10 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const SplitVOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_UniqueOptions: {
+      auto ptr = reinterpret_cast<const UniqueOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -11586,6 +11702,10 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const SplitVOptionsT *>(value);
       return CreateSplitVOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_UniqueOptions: {
+      auto ptr = reinterpret_cast<const UniqueOptionsT *>(value);
+      return CreateUniqueOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -11908,6 +12028,10 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new SplitVOptionsT(*reinterpret_cast<SplitVOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_UniqueOptions: {
+      value = new UniqueOptionsT(*reinterpret_cast<UniqueOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -12310,6 +12434,11 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_UniqueOptions: {
+      auto ptr = reinterpret_cast<UniqueOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
diff --git a/tensorflow/lite/testing/BUILD b/tensorflow/lite/testing/BUILD
index fa25cfaa69e5cce5b8523da8fa7a29dd5de1fa0f..19c950b4f823723754daada68655ef39861f9d88 100644
--- a/tensorflow/lite/testing/BUILD
+++ b/tensorflow/lite/testing/BUILD
@@ -257,7 +257,7 @@ cc_library(
     ],
 )
 
-cc_test(
+tf_cc_test(
     name = "tf_driver_test",
     size = "small",
     srcs = ["tf_driver_test.cc"],
@@ -286,7 +286,7 @@ cc_library(
     ],
 )
 
-cc_test(
+tf_cc_test(
     name = "generate_testspec_test",
     size = "small",
     srcs = ["generate_testspec_test.cc"],
diff --git a/tensorflow/lite/testing/generate_examples.py b/tensorflow/lite/testing/generate_examples.py
index dd7b3d07456fbd9943e9f45b815e6015f4973a94..87e7d7eb02cf00507a607cf7ce4fbd34d3c016b9 100644
--- a/tensorflow/lite/testing/generate_examples.py
+++ b/tensorflow/lite/testing/generate_examples.py
@@ -3749,6 +3749,55 @@ def make_placeholder_with_default_tests(zip_path):
                     expected_tf_success=3)
 
 
+def make_unique_tests(zip_path):
+  """Make a set of tests for Unique op."""
+
+  test_parameters = [
+      {
+          "input_shape": [[1]],
+          "index_type": [tf.int32, tf.int64, None],
+          "input_values": [3]
+      },
+      {
+          "input_shape": [[5]],
+          "index_type": [tf.int32, tf.int64],
+          "input_values": [[3, 2, 1, 2, 3]]
+      },
+      {
+          "input_shape": [[7]],
+          "index_type": [tf.int32, tf.int64],
+          "input_values": [[1, 1, 1, 1, 1, 1, 1]]
+      },
+      {
+          "input_shape": [[5]],
+          "index_type": [tf.int32, tf.int64],
+          "input_values": [[3, 2, 1, 0, -1]]
+      }]
+
+  def build_graph(parameters):
+    """Build the graph for the test case."""
+
+    input_tensor = tf.placeholder(
+        dtype=tf.int32, name="input", shape=parameters["input_shape"])
+    if parameters["index_type"] is None:
+      output = tf.unique(input_tensor)
+    else:
+      output = tf.unique(input_tensor, parameters["index_type"])
+
+    return [input_tensor], output
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = [create_tensor_data(tf.int32, parameters["input_shape"])]
+    return input_values, sess.run(
+        outputs, feed_dict=dict(zip(inputs, input_values)))
+
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_success=9)
+
 # Toco binary path provided by the generate rule.
 bin_path = None
 
diff --git a/tensorflow/lite/testing/tf_driver_test.cc b/tensorflow/lite/testing/tf_driver_test.cc
index 363d162d56a1670821d29768bc36411bf22d61e9..e79704d616cf59585228851b91c2e93259d84c0b 100644
--- a/tensorflow/lite/testing/tf_driver_test.cc
+++ b/tensorflow/lite/testing/tf_driver_test.cc
@@ -93,7 +93,7 @@ TEST(TfDriverTest, SimpleTest) {
                    {"1,8,8,3", "1,8,8,3", "1,8,8,3", "1,8,8,3"}, {"x", "y"}));
 
   runner->LoadModel(
-      "third_party/tensorflow/lite/testdata/multi_add.pb");
+      "tensorflow/lite/testdata/multi_add.pb");
   EXPECT_TRUE(runner->IsValid()) << runner->GetErrorMessage();
 
   ASSERT_THAT(runner->GetInputs(), ElementsAre(0, 1, 2, 3));
diff --git a/tensorflow/lite/testing/tflite_driver.cc b/tensorflow/lite/testing/tflite_driver.cc
index ffe296432a42a2aef246f170da25ea23487db06d..a637dc86c020d4e16fb4fc02e9f62e8dec6a3a25 100644
--- a/tensorflow/lite/testing/tflite_driver.cc
+++ b/tensorflow/lite/testing/tflite_driver.cc
@@ -79,32 +79,7 @@ class TfLiteDriver::Expectation {
     SetTensorData(values, &data_);
   }
 
-  template <>
-  void SetData<string>(const string& csv_values) {
-    string s = absl::HexStringToBytes(csv_values);
-    data_.raw = new char[s.size()];
-    memcpy(data_.raw, s.data(), s.size());
-  }
-
-  bool Check(bool verbose, const TfLiteTensor& tensor) {
-    switch (tensor.type) {
-      case kTfLiteFloat32:
-        return TypedCheck<float>(verbose, tensor);
-      case kTfLiteInt32:
-        return TypedCheck<int32_t>(verbose, tensor);
-      case kTfLiteInt64:
-        return TypedCheck<int64_t>(verbose, tensor);
-      case kTfLiteUInt8:
-        return TypedCheck<uint8_t>(verbose, tensor);
-      case kTfLiteBool:
-        return TypedCheck<bool>(verbose, tensor);
-      case kTfLiteString:
-        return TypedCheck<string>(verbose, tensor);
-      default:
-        fprintf(stderr, "Unsupported type %d in Check\n", tensor.type);
-        return false;
-    }
-  }
+  bool Check(bool verbose, const TfLiteTensor& tensor);
 
  private:
   template <typename T>
@@ -146,49 +121,77 @@ class TfLiteDriver::Expectation {
     return good_output;
   }
 
-  template <>
-  bool TypedCheck<string>(bool verbose, const TfLiteTensor& tensor) {
-    if (tensor.data.raw == nullptr) {
+  TfLitePtrUnion data_;
+  size_t num_elements_;
+};
+
+template <>
+void TfLiteDriver::Expectation::SetData<string>(const string& csv_values) {
+  string s = absl::HexStringToBytes(csv_values);
+  data_.raw = new char[s.size()];
+  memcpy(data_.raw, s.data(), s.size());
+}
+
+template <>
+bool TfLiteDriver::Expectation::TypedCheck<string>(bool verbose,
+                                                   const TfLiteTensor& tensor) {
+  if (tensor.data.raw == nullptr) {
+    if (verbose) {
+      std::cerr << "  got empty string" << std::endl;
+    }
+    return false;
+  }
+  int expected_num_strings = GetStringCount(data_.raw);
+  int returned_num_strings = GetStringCount(tensor.data.raw);
+  if (expected_num_strings != returned_num_strings) {
+    if (verbose) {
+      std::cerr << "  string count differ: got " << returned_num_strings
+                << ", but expected " << expected_num_strings << std::endl;
+    }
+    return false;
+  }
+  for (int i = 0; i < returned_num_strings; ++i) {
+    auto expected_ref = GetString(data_.raw, i);
+    auto returned_ref = GetString(tensor.data.raw, i);
+    if (expected_ref.len != returned_ref.len) {
       if (verbose) {
-        std::cerr << "  got empty string" << std::endl;
+        std::cerr << "  index " << i << ": got string of size "
+                  << returned_ref.len << ", but expected size "
+                  << expected_ref.len << std::endl;
       }
       return false;
     }
-    int expected_num_strings = GetStringCount(data_.raw);
-    int returned_num_strings = GetStringCount(tensor.data.raw);
-    if (expected_num_strings != returned_num_strings) {
+    if (strncmp(expected_ref.str, returned_ref.str, returned_ref.len) != 0) {
       if (verbose) {
-        std::cerr << "  string count differ: got " << returned_num_strings
-                  << ", but expected " << expected_num_strings << std::endl;
+        std::cerr << "  index " << i << ": strings are different" << std::endl;
       }
       return false;
     }
-    for (int i = 0; i < returned_num_strings; ++i) {
-      auto expected_ref = GetString(data_.raw, i);
-      auto returned_ref = GetString(tensor.data.raw, i);
-      if (expected_ref.len != returned_ref.len) {
-        if (verbose) {
-          std::cerr << "  index " << i << ": got string of size "
-                    << returned_ref.len << ", but expected size "
-                    << expected_ref.len << std::endl;
-        }
-        return false;
-      }
-      if (strncmp(expected_ref.str, returned_ref.str, returned_ref.len) != 0) {
-        if (verbose) {
-          std::cerr << "  index " << i << ": strings are different"
-                    << std::endl;
-        }
-        return false;
-      }
-    }
-
-    return true;
   }
 
-  TfLitePtrUnion data_;
-  size_t num_elements_;
-};
+  return true;
+}
+
+bool TfLiteDriver::Expectation::Check(bool verbose,
+                                      const TfLiteTensor& tensor) {
+  switch (tensor.type) {
+    case kTfLiteFloat32:
+      return TypedCheck<float>(verbose, tensor);
+    case kTfLiteInt32:
+      return TypedCheck<int32_t>(verbose, tensor);
+    case kTfLiteInt64:
+      return TypedCheck<int64_t>(verbose, tensor);
+    case kTfLiteUInt8:
+      return TypedCheck<uint8_t>(verbose, tensor);
+    case kTfLiteBool:
+      return TypedCheck<bool>(verbose, tensor);
+    case kTfLiteString:
+      return TypedCheck<string>(verbose, tensor);
+    default:
+      fprintf(stderr, "Unsupported type %d in Check\n", tensor.type);
+      return false;
+  }
+}
 
 TfLiteDriver::TfLiteDriver(bool use_nnapi, const string& delegate_name,
                            bool reference_kernel)
diff --git a/tensorflow/lite/toco/BUILD b/tensorflow/lite/toco/BUILD
index 93d41fcae14c8130de87471bdce64edad131c11f..40bceedd6a1e8398d25a4c58a3ee69228ae8d868 100644
--- a/tensorflow/lite/toco/BUILD
+++ b/tensorflow/lite/toco/BUILD
@@ -342,13 +342,15 @@ tf_cc_test(
     name = "import_tensorflow_test",
     srcs = ["import_tensorflow_test.cc"],
     deps = [
+        ":toco_port",
         ":toco_tooling",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -385,9 +387,11 @@ tf_cc_test(
     srcs = ["tooling_util_test.cc"],
     deps = [
         ":model",
+        ":toco_port",
         ":tooling_util",
         "//tensorflow/core:lib",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -451,12 +455,13 @@ tf_cc_test(
         ":toco_port",
         ":toco_tooling",
         ":types_proto_cc",
-        "@com_google_googletest//:gtest_main",
+        "@com_google_googletest//:gtest",
         "@com_google_absl//absl/strings",
         "//tensorflow/core:lib",
         # We cannot embed the core:ops dependency directly into :toco_tooling as
         # it can conflict with downstream deps when toco is used as a library.
         "//tensorflow/core:ops",
+        "//tensorflow/lite/testing:util",
     ],
 )
 
@@ -468,6 +473,7 @@ tf_cc_test(
     ],
     deps = [
         ":toco_port",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
     ],
 )
diff --git a/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc
index cbae6610d7f4703a898d8d6f35351a09cd70173c..6d9aad66b64848fcdee383e9fc76252dd3fd9b54 100644
--- a/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc
+++ b/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc
@@ -252,6 +252,14 @@ void SetDataTypeForAllOutputs(Model* model, Operator* op,
       SetDataTypeForAllOutputs(model, op, data_type);
       break;
     }
+    case OperatorType::kUnique: {
+      CHECK_EQ(op->outputs.size(), 2);
+      const UniqueOperator* unique_op = static_cast<UniqueOperator*>(op);
+      const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
+      model->GetArray(op->outputs[0]).data_type = data_type;
+      model->GetArray(op->outputs[1]).data_type = unique_op->idx_out_type;
+      break;
+    }
     default: {
       // These operators produce outputs with the same type as their 1st input
       CHECK_GT(op->inputs.size(), 0);
diff --git a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 0e653f08a04f237c861038639a1469eb62f35dfa..5185afd22ecdf81cfc03dfe9b9ee42467cacf4a4 100644
--- a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1828,6 +1828,20 @@ void ProcessMirrorPadOperator(Model* model, MirrorPadOperator* op) {
   output_array.copy_shape(output_shape);
 }
 
+void ProcessUniqueOperator(Model* model, UniqueOperator* op) {
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  // We have 2 outputs, the shape of the index tensor, is the same size
+  // as the input array. The unique values tensor, is unknown until runtime.
+  CHECK_EQ(op->outputs.size(), 2);
+  auto& idx_output_array = model->GetArray(op->outputs[1]);
+
+  // Yield until input dims have been resolved, or output already computed
+  if (!input_array.has_shape() || idx_output_array.has_shape()) {
+    return;
+  }
+  idx_output_array.copy_shape(input_array.shape());
+}
+
 }  // namespace
 
 ::tensorflow::Status PropagateFixedSizes::Run(Model* model,
@@ -2103,6 +2117,9 @@ void ProcessMirrorPadOperator(Model* model, MirrorPadOperator* op) {
     case OperatorType::kMirrorPad:
       ProcessMirrorPadOperator(model, static_cast<MirrorPadOperator*>(op));
       break;
+    case OperatorType::kUnique:
+      ProcessUniqueOperator(model, static_cast<UniqueOperator*>(op));
+      break;
     default:
       // Unimplemented, another graph transformation should drop it.
       LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(op->type);
diff --git a/tensorflow/lite/toco/import_tensorflow.cc b/tensorflow/lite/toco/import_tensorflow.cc
index e1f7eb82ee00b0ef43155ea3434c3a2b0ce7cd12..86e04b2393ce81e748bdcb803375393d92885896 100644
--- a/tensorflow/lite/toco/import_tensorflow.cc
+++ b/tensorflow/lite/toco/import_tensorflow.cc
@@ -1190,7 +1190,7 @@ enum FlexSupport { kFlexOk, kFlexNotOk };
 // taken from the given NodeDef, and its number must match NumInputs, unless
 // kAnyNumInputs is passed in. If kFlexOk is passed in the resulting operator
 // will be eligible for being exported as a flex op.
-template <typename Op, int NumInputs, FlexSupport flex>
+template <typename Op, int NumInputs, int NumOutputs, FlexSupport flex>
 tensorflow::Status ConvertSimpleOperatorGeneric(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
@@ -1203,6 +1203,11 @@ tensorflow::Status ConvertSimpleOperatorGeneric(
     op->inputs.push_back(node.input(i));
   }
   op->outputs.push_back(node.name());
+  if (NumOutputs > 1) {
+    for (int i = 1; i < NumOutputs; ++i) {
+      op->outputs.push_back(node.name() + ":" + std::to_string(i));
+    }
+  }
 
   if (flex == kFlexOk) {
     RetainTensorFlowNodeDef(node, op);
@@ -1213,20 +1218,20 @@ tensorflow::Status ConvertSimpleOperatorGeneric(
 }
 
 // Convert a simple operator which is not valid as a flex op.
-template <typename Op, int NumInputs = kAnyNumInputs>
+template <typename Op, int NumInputs, int NumOutputs>
 tensorflow::Status ConvertSimpleOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
-  return ConvertSimpleOperatorGeneric<Op, NumInputs, kFlexNotOk>(
+  return ConvertSimpleOperatorGeneric<Op, NumInputs, NumOutputs, kFlexNotOk>(
       node, tf_import_flags, model);
 }
 
 // Convert a simple operator which is valid as a flex op.
-template <typename Op, int NumInputs = kAnyNumInputs>
+template <typename Op, int NumInputs, int NumOutputs>
 tensorflow::Status ConvertSimpleOperatorFlexOk(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
-  return ConvertSimpleOperatorGeneric<Op, NumInputs, kFlexOk>(
+  return ConvertSimpleOperatorGeneric<Op, NumInputs, NumOutputs, kFlexOk>(
       node, tf_import_flags, model);
 }
 
@@ -2333,14 +2338,15 @@ ConverterMapType GetTensorFlowNodeConverterMapForFlex() {
 
 ConverterMapType GetTensorFlowNodeConverterMap() {
   return std::unordered_map<std::string, ConverterType>({
-      {"Abs", ConvertSimpleOperator<AbsOperator>},
-      {"Add", ConvertSimpleOperator<AddOperator, 2>},
-      {"AddN", ConvertSimpleOperatorFlexOk<AddNOperator>},
-      {"All", ConvertSimpleOperator<TensorFlowAllOperator>},
+      {"Abs", ConvertSimpleOperator<AbsOperator, kAnyNumInputs, 1>},
+      {"Add", ConvertSimpleOperator<AddOperator, 2, 1>},
+      {"AddN", ConvertSimpleOperatorFlexOk<AddNOperator, kAnyNumInputs, 1>},
+      {"All", ConvertSimpleOperator<TensorFlowAllOperator, kAnyNumInputs, 1>},
       {"Any", ConvertReduceOperator<TensorFlowAnyOperator>},
       {"ArgMax", ConvertArgMaxOperator},
       {"ArgMin", ConvertArgMinOperator},
-      {"Assert", ConvertSimpleOperator<TensorFlowAssertOperator>},
+      {"Assert",
+       ConvertSimpleOperator<TensorFlowAssertOperator, kAnyNumInputs, 1>},
       {"AvgPool", ConvertAvgPoolOperator},
       {"BatchMatMul", ConvertBatchMatMulOperator},
       {"BatchNormWithGlobalNormalization",
@@ -2357,98 +2363,99 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"CTCBeamSearchDecoder", ConvertCTCBeamSearchDecoderOperator},
       {"DepthToSpace", ConvertDepthToSpaceOperator},
       {"DepthwiseConv2dNative", ConvertDepthwiseConvOperator},
-      {"Div", ConvertSimpleOperator<DivOperator, 2>},
+      {"Div", ConvertSimpleOperator<DivOperator, 2, 1>},
       {"DynamicPartition", ConvertDynamicPartitionOperator},
       {"DynamicStitch", ConvertDynamicStitchOperator},
-      {"Equal", ConvertSimpleOperator<TensorFlowEqualOperator, 2>},
-      {"Exp", ConvertSimpleOperator<ExpOperator, 1>},
-      {"ExpandDims", ConvertSimpleOperator<ExpandDimsOperator, 2>},
+      {"Equal", ConvertSimpleOperator<TensorFlowEqualOperator, 2, 1>},
+      {"Exp", ConvertSimpleOperator<ExpOperator, 1, 1>},
+      {"ExpandDims", ConvertSimpleOperator<ExpandDimsOperator, 2, 1>},
       {"FakeQuantWithMinMaxArgs", ConvertFakeQuantWithMinMaxArgs},
       {"FakeQuantWithMinMaxVars", ConvertFakeQuantWithMinMaxVars},
-      {"Fill", ConvertSimpleOperator<FillOperator, 2>},
+      {"Fill", ConvertSimpleOperator<FillOperator, 2, 1>},
       {"Floor", ConvertFloorOperator},
-      {"FloorDiv", ConvertSimpleOperator<FloorDivOperator, 2>},
-      {"FloorMod", ConvertSimpleOperator<FloorModOperator, 2>},
+      {"FloorDiv", ConvertSimpleOperator<FloorDivOperator, 2, 1>},
+      {"FloorMod", ConvertSimpleOperator<FloorModOperator, 2, 1>},
       {"FusedBatchNorm", ConvertFusedBatchNormOperator},
       {"Gather", ConvertGatherOperator},
       {"GatherV2", ConvertGatherOperator},
-      {"Greater", ConvertSimpleOperator<TensorFlowGreaterOperator, 2>},
+      {"Greater", ConvertSimpleOperator<TensorFlowGreaterOperator, 2, 1>},
       {"GreaterEqual",
-       ConvertSimpleOperator<TensorFlowGreaterEqualOperator, 2>},
+       ConvertSimpleOperator<TensorFlowGreaterEqualOperator, 2, 1>},
       {"Identity", ConvertIdentityOperator},
       {"LRN", ConvertLRNOperator},
       {"LeakyRelu", ConvertLeakyReluOperator},
       {"LegacyFedInput", ConvertPlaceholderOperator},
-      {"Less", ConvertSimpleOperator<TensorFlowLessOperator, 2>},
-      {"LessEqual", ConvertSimpleOperator<TensorFlowLessEqualOperator, 2>},
-      {"Log", ConvertSimpleOperator<LogOperator, 1>},
-      {"LogicalAnd", ConvertSimpleOperator<LogicalAndOperator, 2>},
-      {"LogicalOr", ConvertSimpleOperator<LogicalOrOperator, 2>},
-      {"LogicalNot", ConvertSimpleOperator<LogicalNotOperator, 1>},
-      {"LogSoftmax", ConvertSimpleOperator<LogSoftmaxOperator, 1>},
+      {"Less", ConvertSimpleOperator<TensorFlowLessOperator, 2, 1>},
+      {"LessEqual", ConvertSimpleOperator<TensorFlowLessEqualOperator, 2, 1>},
+      {"Log", ConvertSimpleOperator<LogOperator, 1, 1>},
+      {"LogicalAnd", ConvertSimpleOperator<LogicalAndOperator, 2, 1>},
+      {"LogicalOr", ConvertSimpleOperator<LogicalOrOperator, 2, 1>},
+      {"LogicalNot", ConvertSimpleOperator<LogicalNotOperator, 1, 1>},
+      {"LogSoftmax", ConvertSimpleOperator<LogSoftmaxOperator, 1, 1>},
       {"MatMul", ConvertMatMulOperator},
       {"Max", ConvertReduceOperator<TensorFlowMaxOperator>},
       {"MaxPool", ConvertMaxPoolOperator},
-      {"Maximum", ConvertSimpleOperator<TensorFlowMaximumOperator, 2>},
+      {"Maximum", ConvertSimpleOperator<TensorFlowMaximumOperator, 2, 1>},
       {"Mean", ConvertReduceOperator<MeanOperator>},
-      {"Merge", ConvertSimpleOperator<TensorFlowMergeOperator, 2>},
+      {"Merge", ConvertSimpleOperator<TensorFlowMergeOperator, 2, 1>},
       {"Min", ConvertReduceOperator<TensorFlowMinOperator>},
-      {"Minimum", ConvertSimpleOperator<TensorFlowMinimumOperator, 2>},
-      {"Mul", ConvertSimpleOperator<MulOperator, 2>},
-      {"Neg", ConvertSimpleOperator<NegOperator, 1>},
+      {"Minimum", ConvertSimpleOperator<TensorFlowMinimumOperator, 2, 1>},
+      {"Mul", ConvertSimpleOperator<MulOperator, 2, 1>},
+      {"Neg", ConvertSimpleOperator<NegOperator, 1, 1>},
       {"NextIteration", ConvertOperatorSpecialCasedAsRNNBackEdge},
       {"NoOp", ConvertNoOpOperator},
-      {"NotEqual", ConvertSimpleOperator<TensorFlowNotEqualOperator, 2>},
+      {"NotEqual", ConvertSimpleOperator<TensorFlowNotEqualOperator, 2, 1>},
       {"OneHot", ConvertOneHotOperator},
       {"Pack", ConvertPackOperator},
-      {"Pad", ConvertSimpleOperator<PadOperator, 2>},
-      {"PadV2", ConvertSimpleOperator<PadV2Operator, 3>},
+      {"Pad", ConvertSimpleOperator<PadOperator, 2, 1>},
+      {"PadV2", ConvertSimpleOperator<PadV2Operator, 3, 1>},
       {"ParallelDynamicStitch", ConvertDynamicStitchOperator},
       {"Placeholder", ConvertPlaceholderOperator},
       {"PlaceholderWithDefault", ConvertIdentityOperator},
-      {"Pow", ConvertSimpleOperator<PowOperator, 2>},
+      {"Pow", ConvertSimpleOperator<PowOperator, 2, 1>},
       {"Prod", ConvertReduceOperator<TensorFlowProdOperator>},
       {"RandomUniform", ConvertRandomUniform},
       {"Range", ConvertRangeOperator},
-      {"Rank", ConvertSimpleOperator<RankOperator, 1>},
-      {"RealDiv", ConvertSimpleOperator<DivOperator, 2>},
-      {"Relu", ConvertSimpleOperator<ReluOperator, 1>},
-      {"Relu6", ConvertSimpleOperator<Relu6Operator, 1>},
-      {"Reshape", ConvertSimpleOperator<TensorFlowReshapeOperator, 2>},
+      {"Rank", ConvertSimpleOperator<RankOperator, 1, 1>},
+      {"RealDiv", ConvertSimpleOperator<DivOperator, 2, 1>},
+      {"Relu", ConvertSimpleOperator<ReluOperator, 1, 1>},
+      {"Relu6", ConvertSimpleOperator<Relu6Operator, 1, 1>},
+      {"Reshape", ConvertSimpleOperator<TensorFlowReshapeOperator, 2, 1>},
       {"ResizeBilinear", ConvertResizeBilinearOperator},
       {"ResizeNearestNeighbor", ConvertResizeNearestNeighborOperator},
-      {"Rsqrt", ConvertSimpleOperator<TensorFlowRsqrtOperator, 1>},
-      {"Select", ConvertSimpleOperator<SelectOperator, 3>},
+      {"Rsqrt", ConvertSimpleOperator<TensorFlowRsqrtOperator, 1, 1>},
+      {"Select", ConvertSimpleOperator<SelectOperator, 3, 1>},
       {"Shape", ConvertShapeOperator},
-      {"Sigmoid", ConvertSimpleOperator<LogisticOperator, 1>},
-      {"Sin", ConvertSimpleOperator<SinOperator, 1>},
-      {"Slice", ConvertSimpleOperator<SliceOperator, 3>},
+      {"Sigmoid", ConvertSimpleOperator<LogisticOperator, 1, 1>},
+      {"Sin", ConvertSimpleOperator<SinOperator, 1, 1>},
+      {"Slice", ConvertSimpleOperator<SliceOperator, 3, 1>},
       {"Softmax", ConvertSoftmaxOperator},
       {"SpaceToBatchND", ConvertSpaceToBatchNDOperator},
       {"SpaceToDepth", ConvertSpaceToDepthOperator},
       {"SparseToDense", ConvertSparseToDenseOperator},
       {"Split", ConvertSplitOperator},
       {"SplitV", ConvertSplitVOperator},
-      {"Sqrt", ConvertSimpleOperator<TensorFlowSqrtOperator, 1>},
-      {"Square", ConvertSimpleOperator<TensorFlowSquareOperator, 1>},
+      {"Sqrt", ConvertSimpleOperator<TensorFlowSqrtOperator, 1, 1>},
+      {"Square", ConvertSimpleOperator<TensorFlowSquareOperator, 1, 1>},
       {"SquaredDifference",
-       ConvertSimpleOperator<SquaredDifferenceOperator, 2>},
+       ConvertSimpleOperator<SquaredDifferenceOperator, 2, 1>},
       {"Squeeze", ConvertSqueezeOperator},
       {"StopGradient", ConvertIdentityOperator},
       {"StridedSlice", ConvertStridedSliceOperator},
-      {"Sub", ConvertSimpleOperator<SubOperator, 2>},
+      {"Sub", ConvertSimpleOperator<SubOperator, 2, 1>},
       {"Sum", ConvertReduceOperator<TensorFlowSumOperator>},
       {"Svdf", ConvertSvdfOperator},
       {"Switch", ConvertSwitchOperator},
-      {"Tanh", ConvertSimpleOperator<TanhOperator, 1>},
-      {"Tile", ConvertSimpleOperator<TensorFlowTileOperator, 2>},
+      {"Tanh", ConvertSimpleOperator<TanhOperator, 1, 1>},
+      {"Tile", ConvertSimpleOperator<TensorFlowTileOperator, 2, 1>},
       {"TopK", ConvertTopKV2Operator},
       {"TopKV2", ConvertTopKV2Operator},
-      {"Transpose", ConvertSimpleOperator<TransposeOperator, 2>},
+      {"Transpose", ConvertSimpleOperator<TransposeOperator, 2, 1>},
       {"Unpack", ConvertUnpackOperator},
-      {"ZerosLike", ConvertSimpleOperator<TensorFlowZerosLikeOperator, 1>},
+      {"ZerosLike", ConvertSimpleOperator<TensorFlowZerosLikeOperator, 1, 1>},
       {"UnidirectionalSequenceLstm", ConvertUnidirectionalSequenceLstm},
       {"MirrorPad", ConvertMirrorPadOperator},
+      {"Unique", ConvertSimpleOperator<UniqueOperator, 1, 2>},
   });
 }
 
diff --git a/tensorflow/lite/toco/import_tensorflow_test.cc b/tensorflow/lite/toco/import_tensorflow_test.cc
index 260704fd2ab66d4dc6e980ad0b8be598cca3cb2f..de7f4cdb7e3d81f4e1adc97932e5969ce2385a6f 100644
--- a/tensorflow/lite/toco/import_tensorflow_test.cc
+++ b/tensorflow/lite/toco/import_tensorflow_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/toco/import_tensorflow.h"
+#include "tensorflow/lite/toco/toco_port.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -23,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/lite/testing/util.h"
 
 namespace toco {
 
@@ -564,3 +566,10 @@ TEST(ImportTest, UnsupportedOpWithMultipleOutputs) {
 
 }  // namespace
 }  // namespace toco
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  ::toco::port::InitGoogleWasDoneElsewhere();
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/toco/model.h b/tensorflow/lite/toco/model.h
index e71d36583e8ca3e94ef3aae699b3df4e4dfdd981..bfa86c805944fc585b77bf6a27a80a69c4c5f20a 100644
--- a/tensorflow/lite/toco/model.h
+++ b/tensorflow/lite/toco/model.h
@@ -157,7 +157,8 @@ enum class OperatorType : uint8 {
   kResizeNearestNeighbor,
   kLeakyRelu,
   kAbs,
-  kMirrorPad
+  kMirrorPad,
+  kUnique
 };
 
 // Helper to deal with TensorFlow arrays using a different ordering of
@@ -1953,6 +1954,17 @@ struct MirrorPadOperator : Operator {
   MirrorPadMode mode;
 };
 
+// Unique Operator:
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Unique
+struct UniqueOperator : Operator {
+  UniqueOperator() : Operator(OperatorType::kUnique) {}
+  ArrayDataType idx_out_type = ArrayDataType::kInt32;
+};
+
 // Alloc's are used for transient arrays only. An Alloc specifies which interval
 // of the "transient_data" workspace buffer passed to inference functions, is to
 // be used for the transient array at hand. The 'start' and 'end' values are
diff --git a/tensorflow/lite/toco/python/BUILD b/tensorflow/lite/toco/python/BUILD
index f3ea5b7bfe3d82a6608cb5e7358b397f22453a0f..8a6e82ec46445b5ec5440de129177eae836f8db8 100644
--- a/tensorflow/lite/toco/python/BUILD
+++ b/tensorflow/lite/toco/python/BUILD
@@ -10,20 +10,35 @@ load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
 load("//tensorflow:tensorflow.bzl", "tf_py_test")
 load("//tensorflow:tensorflow.bzl", "py_binary")
 
+config_setting(
+    name = "tflite_convert_with_select_tf_ops",
+    define_values = {"tflite_convert_with_select_tf_ops": "true"},
+    visibility = [
+        "//tensorflow/contrib/lite:__subpackages__",
+        "//tensorflow/lite:__subpackages__",
+    ],
+)
+
 cc_library(
     name = "toco_python_api",
     srcs = ["toco_python_api.cc"],
     hdrs = ["toco_python_api.h"],
     deps = [
+        "//third_party/python_runtime:headers",
         "//tensorflow/core:lib",
-        "//tensorflow/core:ops",
         "//tensorflow/lite/toco:model_flags_proto_cc",
         "//tensorflow/lite/toco:toco_flags_proto_cc",
         "//tensorflow/lite/toco:toco_graphviz_dump_options",
         "//tensorflow/lite/toco:toco_port",
         "//tensorflow/lite/toco:toco_tooling",
-        "//third_party/python_runtime:headers",
-    ],
+    ] + select({
+        # This is required when running `tflite_convert` from `bazel`.
+        # It requires to link with TensorFlow Ops to get the op definitions.
+        ":tflite_convert_with_select_tf_ops": [
+            "//tensorflow/core:ops",
+        ],
+        "//conditions:default": [],
+    }),
 )
 
 tf_py_wrap_cc(
diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc
index d3fce6893f75d55ed54184d3dc545e38f10d9402..8eb4d321ef020bc9fb6ac3dc86ea262170f3835b 100644
--- a/tensorflow/lite/toco/tflite/operator.cc
+++ b/tensorflow/lite/toco/tflite/operator.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/util/ptr_util.h"
 // TODO(ycling): Consider refactoring to extract the LSTM definition out of
 // graph_transformation module.
+#include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/toco/graph_transformations/lstm_utils.h"
 #include "tensorflow/lite/toco/model.h"
 #include "tensorflow/lite/toco/tflite/builtin_operator.h"
@@ -1478,6 +1479,31 @@ class MirrorPad
                    : MirrorPadMode::kSymmetric;
   }
 
+  int GetVersion(const OperatorSignature& op) const override { return 1; }
+};
+
+class Unique : public BuiltinOperator<UniqueOperator, ::tflite::UniqueOptions,
+                                      ::tflite::BuiltinOptions_UniqueOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    const UniqueOperator& unique_op = static_cast<const UniqueOperator&>(op);
+    return ::tflite::CreateUniqueOptions(
+        *builder, unique_op.idx_out_type == toco::ArrayDataType::kInt64
+                      ? ::tflite::TensorType::TensorType_INT64
+                      : ::tflite::TensorType_INT32);
+  }
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    UniqueOperator* unique_op = static_cast<UniqueOperator*>(op);
+    unique_op->idx_out_type =
+        options.idx_out_type() == ::tflite::TensorType_INT64
+            ? toco::ArrayDataType::kInt64
+            : toco::ArrayDataType::kInt32;
+  }
+
   int GetVersion(const OperatorSignature& op_signature) const override {
     return 1;
   }
@@ -1819,6 +1845,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
       OperatorType::kSquaredDifference));
   ops.push_back(MakeUnique<MirrorPad>(::tflite::BuiltinOperator_MIRROR_PAD,
                                       OperatorType::kMirrorPad));
+  ops.push_back(MakeUnique<Unique>(::tflite::BuiltinOperator_UNIQUE,
+                                   OperatorType::kUnique));
 
   // Custom Operators.
   ops.push_back(
diff --git a/tensorflow/lite/toco/tflite/operator_test.cc b/tensorflow/lite/toco/tflite/operator_test.cc
index 849eace8cc5f8b71f509389961c01055495763dc..215eda82f6df154574d7bd290ff329259edcc391 100644
--- a/tensorflow/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/lite/toco/tflite/operator_test.cc
@@ -629,6 +629,15 @@ TEST_F(OperatorTest, BuiltinMirrorPad) {
   EXPECT_EQ(op.mode, output_toco_op->mode);
 }
 
+TEST_F(OperatorTest, BuiltinUnique) {
+  UniqueOperator op;
+  op.idx_out_type = ArrayDataType::kInt64;
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("UNIQUE", OperatorType::kUnique), op);
+  ASSERT_NE(nullptr, output_toco_op.get());
+  EXPECT_EQ(output_toco_op->idx_out_type, op.idx_out_type);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/toco/toco_convert_test.cc b/tensorflow/lite/toco/toco_convert_test.cc
index c3c440db94396def2f8cfd40242642767d11a63a..739b924607e7aa60bcdb6f081de52aed65a87d58 100644
--- a/tensorflow/lite/toco/toco_convert_test.cc
+++ b/tensorflow/lite/toco/toco_convert_test.cc
@@ -15,6 +15,8 @@ limitations under the License.
 #include "tensorflow/lite/toco/toco_convert.h"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/lite/testing/util.h"
+#include "tensorflow/lite/toco/toco_port.h"
 
 namespace toco {
 namespace {
@@ -171,3 +173,10 @@ TEST(TocoTest, TransientStringTensors) {
 
 }  // namespace
 }  // namespace toco
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  ::toco::port::InitGoogleWasDoneElsewhere();
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/toco/toco_port.cc b/tensorflow/lite/toco/toco_port.cc
index fb8c1b8337f1e509ed9c9ee2522e63e84d143927..b222032e61418224efddbae2c6ec2f110286ab0b 100644
--- a/tensorflow/lite/toco/toco_port.cc
+++ b/tensorflow/lite/toco/toco_port.cc
@@ -57,6 +57,11 @@ void InitGoogle(const char* usage, int* argc, char*** argv, bool remove_flags) {
   ::InitGoogle(usage, argc, argv, remove_flags);
 }
 
+void InitGoogleWasDoneElsewhere() {
+  // Nothing need be done since ::CheckInitGoogleIsDone() is aware of other
+  // possible initialization entry points.
+}
+
 void CheckInitGoogleIsDone(const char* message) {
   ::CheckInitGoogleIsDone(message);
 }
@@ -152,6 +157,8 @@ constexpr int kFileWriteFlags = O_CREAT | O_WRONLY;
 
 static bool port_initialized = false;
 
+void InitGoogleWasDoneElsewhere() { port_initialized = true; }
+
 void InitGoogle(const char* usage, int* argc, char*** argv, bool remove_flags) {
   if (!port_initialized) {
 #if defined(PLATFORM_GOOGLE)
diff --git a/tensorflow/lite/toco/toco_port.h b/tensorflow/lite/toco/toco_port.h
index 2f39e3d6d5c02457e9ade320e7525fbf881b5389..231612ecd43f3d77fc959a38642690ff6beed19b 100644
--- a/tensorflow/lite/toco/toco_port.h
+++ b/tensorflow/lite/toco/toco_port.h
@@ -55,6 +55,10 @@ double round(double x);
 namespace toco {
 namespace port {
 
+// Things like tests use other initialization routines that need control
+// of flags. However, for testing we still want to use toco_port.h facilities.
+// This function sets initialized flag trivially.
+void InitGoogleWasDoneElsewhere();
 void InitGoogle(const char* usage, int* argc, char*** argv, bool remove_flags);
 void CheckInitGoogleIsDone(const char* message);
 
diff --git a/tensorflow/lite/toco/toco_port_test.cc b/tensorflow/lite/toco/toco_port_test.cc
index f5fbb4caeb2882d51c4b586293eb202fcf60a9de..997da58b8f64386dfbf6e41ff5838373dd8d64c2 100644
--- a/tensorflow/lite/toco/toco_port_test.cc
+++ b/tensorflow/lite/toco/toco_port_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/toco/toco_port.h"
+#include "tensorflow/lite/testing/util.h"
 #include "tensorflow/lite/toco/toco_types.h"
 
 #include <gmock/gmock.h>
@@ -56,3 +57,10 @@ TEST(TocoPortTest, JoinPath) {
 }  // namespace
 }  // namespace port
 }  // namespace toco
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  ::toco::port::InitGoogleWasDoneElsewhere();
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/toco/tooling_util.cc b/tensorflow/lite/toco/tooling_util.cc
index af4cd386a209d82cb56a877410abe6fbdbf99c7b..2396de1a3dff75b8030dd83497d20a24b611269a 100644
--- a/tensorflow/lite/toco/tooling_util.cc
+++ b/tensorflow/lite/toco/tooling_util.cc
@@ -416,6 +416,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(LeakyRelu)
     HANDLE_OPERATORTYPENAME_CASE(SquaredDifference)
     HANDLE_OPERATORTYPENAME_CASE(MirrorPad)
+    HANDLE_OPERATORTYPENAME_CASE(Unique)
     default:
       LOG(FATAL) << "Unhandled op type";
 #undef HANDLE_OPERATORTYPENAME_CASE
diff --git a/tensorflow/lite/toco/tooling_util_test.cc b/tensorflow/lite/toco/tooling_util_test.cc
index 6f1c9c563ada01891b67094caa93cfd1847cdf6b..faa6fe412ec1fd6236f1966f238dd21dfcf9395b 100644
--- a/tensorflow/lite/toco/tooling_util_test.cc
+++ b/tensorflow/lite/toco/tooling_util_test.cc
@@ -16,9 +16,11 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/lite/testing/util.h"
 #include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/toco_port.h"
 #include "tensorflow/lite/toco/tooling_util.h"
-#include "tensorflow/core/lib/core/status.h"
 
 namespace toco {
 
@@ -203,3 +205,10 @@ TEST(FusedActivationTest, DefaultsToUnfused) {
 }
 
 }  // namespace toco
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  ::toco::port::InitGoogleWasDoneElsewhere();
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc
index 9a74e221c13e72c286512175a7f633c87f75eedd..129747fe4d5c93630f9f6552a9486cbe8f8c37b7 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc
@@ -22,6 +22,12 @@ limitations under the License.
 
 #include "absl/memory/memory.h"
 #include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/lite/tools/accuracy/eval_pipeline.h"
 #include "tensorflow/lite/tools/accuracy/eval_pipeline_builder.h"
 #include "tensorflow/lite/tools/accuracy/file_reader_stage.h"
@@ -29,12 +35,6 @@ limitations under the License.
 #include "tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
 #include "tensorflow/lite/tools/accuracy/run_tflite_model_stage.h"
 #include "tensorflow/lite/tools/accuracy/utils.h"
-#include "tensorflow/core/lib/core/blocking_counter.h"
-#include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/init_main.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 
 namespace {
 using tensorflow::string;
@@ -185,21 +185,17 @@ Status EvaluateModelForShard(const uint64_t shard_id,
   const TensorShape& input_shape = model_info.input_shapes[0];
   const int image_height = input_shape.dim_size(1);
   const int image_width = input_shape.dim_size(2);
-  const bool is_quantized = (model_info.input_types[0] == DT_UINT8);
 
   RunTFLiteModelStage::Params tfl_model_params;
   tfl_model_params.model_file_path = params.model_file_path;
-  if (is_quantized) {
-    tfl_model_params.input_type = {DT_UINT8};
-    tfl_model_params.output_type = {DT_UINT8};
-  } else {
-    tfl_model_params.input_type = {DT_FLOAT};
-    tfl_model_params.output_type = {DT_FLOAT};
-  }
+
+  tfl_model_params.input_type = {model_info.input_types[0]};
+  tfl_model_params.output_type = {model_info.input_types[0]};
 
   Scope root = Scope::NewRootScope();
   FileReaderStage reader;
-  InceptionPreprocessingStage inc(image_height, image_width, is_quantized);
+  InceptionPreprocessingStage inc(image_height, image_width,
+                                  model_info.input_types[0]);
   RunTFLiteModelStage tfl_model_stage(tfl_model_params);
   EvalPipelineBuilder builder;
 
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc
index 2b086cdf7075d7e6328ce0a41b17ca611ea3c4e2..f5642d52a89d86930023fd21a6d81e628073927c 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc
@@ -67,11 +67,18 @@ Status ImagenetTopKAccuracy::ComputeEval(
     for (size_t i = 0; i < probs.size(); i++) {
       probabilities.push_back(probs(i));
     }
-  } else {
+  } else if (output.dtype() == DT_UINT8) {
     auto probs = output.flat<uint8>();
     for (size_t i = 0; i < probs.size(); i++) {
       probabilities.push_back(probs(i));
     }
+  } else if (output.dtype() == DT_INT8) {
+    auto probs = output.flat<int8>();
+    for (size_t i = 0; i < probs.size(); i++) {
+      probabilities.push_back(probs(i));
+    }
+  } else {
+    return errors::InvalidArgument("Invalid datatype");
   }
 
   CHECK_EQ(kNumCategories, probabilities.size());
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
index 9a889f0dd88bc4c51b2c060baf0e89c126c98c1f..04b6cb755892bd218d899587bd81b818a51f85d8 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
@@ -57,23 +57,33 @@ void InceptionPreprocessingStage::AddToGraph(const Scope& scope,
   tensorflow::Output cropped_image;
   CentralCropImage(s, decoded_jpeg, params_.cropping_fraction, &cropped_image);
   auto dims_expander = ops::ExpandDims(s, cropped_image, 0);
-  auto resized_image = ops::ResizeBilinear(
-      s, dims_expander,
-      ops::Const(s.WithOpName("size"), {image_height_, image_width_}));
-  if (is_quantized_) {
-    this->stage_output_ =
-        ops::Cast(s.WithOpName(output_name()), resized_image, DT_UINT8);
-  } else {
-    auto squeezed_image = ops::Squeeze(s, resized_image);
-    auto normalized_image =
-        ops::Div(s,
-                 ops::Sub(s, squeezed_image,
-                          {params_.input_means[0], params_.input_means[1],
-                           params_.input_means[2]}),
-                 {params_.scale});
-    this->stage_output_ =
-        ops::ExpandDims(s.WithOpName(output_name()), normalized_image, {0});
+  auto resized_image =
+      ops::ResizeBilinear(s.WithOpName("resize"), dims_expander,
+                          ops::Const(s, {image_height_, image_width_}));
+
+  ::tensorflow::Output preprocessed_image = resized_image;
+
+  if (!params_.input_means.empty()) {
+    preprocessed_image =
+        ops::Sub(s.WithOpName("sub"), preprocessed_image,
+                 {params_.input_means[0], params_.input_means[1],
+                  params_.input_means[2]});
+  }
+
+  if (std::abs(params_.scale) > 1e-7f) {
+    auto squeezed_image = ops::Squeeze(s, preprocessed_image);
+    preprocessed_image = ops::Div(s, squeezed_image, {params_.scale});
+    preprocessed_image = ops::ExpandDims(s, preprocessed_image, {0});
   }
+
+  // Cast the output from float to output datatype.
+  if (output_datatype_ != DT_FLOAT) {
+    preprocessed_image =
+        ops::Cast(s.WithOpName("cast"), preprocessed_image, output_datatype_);
+  }
+
+  this->stage_output_ =
+      ops::Identity(s.WithOpName(output_name()), preprocessed_image);
 }
 
 }  // namespace metrics
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h
index 4a1d3ce4769d1a7d3f46f39941eb3e9bcde7785c..371feb3e76515a714286983a393c10dbaf4be3c8 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_INCEPTION_PREPROCESSING_H_
-#define TENSORFLOW_LITE_TOOLS_ACCURACY_INCEPTION_PREPROCESSING_H_
+#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_ILSVRC_INCEPTION_PREPROCESSING_H_
+#define TENSORFLOW_LITE_TOOLS_ACCURACY_ILSVRC_INCEPTION_PREPROCESSING_H_
 
 #include <utility>
 
-#include "tensorflow/lite/tools/accuracy/stage.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/lite/tools/accuracy/stage.h"
 
 namespace tensorflow {
 namespace metrics {
@@ -31,28 +31,53 @@ namespace metrics {
 // shape {1, image_height, image_width, 3}, where 3 is the number of channels.
 class InceptionPreprocessingStage : public Stage {
  public:
+  // Preprocessing params that govern scaling and normalization of channels of
+  // the image.
   struct Params {
+    // Input means are subtracted from each channel.
+    // In case of an empty vector this is skipped.
     std::vector<float> input_means;
+    // Scale is used to divide the input.
+    // A scale of 0 means divison is skipped.
     float scale;
     double cropping_fraction;
   };
 
-  static Params DefaultParams() {
-    return {.input_means = {127.5, 127.5, 127.5},
-            .scale = 127.5,
-            .cropping_fraction = 0.875};
+  // Default preprocessing for inception stage based on |output_type|
+  static Params DefaultParamsForType(DataType output_type) {
+    const float kCroppingFraction = 0.875;
+    Params params = {};
+    params.cropping_fraction = kCroppingFraction;
+    if (output_type == DT_UINT8) {
+    } else if (output_type == DT_INT8) {
+      params.input_means = {128.0, 128.0, 128.0};
+    } else {
+      // Assume floating point preprocessing.
+      params.input_means = {127.5, 127.5, 127.5};
+      params.scale = 127.5;
+    }
+    return params;
+  }
+
+  // Creates a new preprocessing stage object with provided |image_width|
+  // |image_height| as the size of output image.
+  // |output_datatype| is the datatype of output of the stage.
+  InceptionPreprocessingStage(int image_width, int image_height,
+                              DataType output_datatype)
+      : output_datatype_(output_datatype),
+        image_width_(image_width),
+        image_height_(image_height) {
+    params_ = DefaultParamsForType(output_datatype);
   }
 
   // Creates a new preprocessing stage object with provided |image_width|
   // |image_height| as the size of output image.
-  // If |is_quantized| is set to true then |params| is ignored since quantized
-  // images don't go through any preprocessing.
+  // |output_datatype| is the datatype of output of the stage.
   InceptionPreprocessingStage(int image_width, int image_height,
-                              bool is_quantized,
-                              Params params = DefaultParams())
-      : image_width_(image_width),
+                              DataType output_datatype, Params params)
+      : output_datatype_(output_datatype),
+        image_width_(image_width),
         image_height_(image_height),
-        is_quantized_(is_quantized),
         params_(std::move(params)) {}
 
   string name() const override { return "stage_inception_preprocess"; }
@@ -63,6 +88,7 @@ class InceptionPreprocessingStage : public Stage {
   void AddToGraph(const Scope& scope, const Input& input) override;
 
  private:
+  DataType output_datatype_;
   int image_width_;
   int image_height_;
   bool is_quantized_;
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc
index 5d0e01d7d18c451b978edbd08fc27934c8379961..f88847035f21ee41eb7403aae99c9d7db1484499 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc
@@ -17,10 +17,10 @@ limitations under the License.
 #include <string>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
 
 namespace {
 tensorflow::string* g_test_image_file = nullptr;
@@ -48,7 +48,7 @@ Status GetContents(const string& filename, string* output) {
   }
 }
 
-TEST(InceptionPreprocessingTest, TestImagePreprocessQuantized) {
+TEST(InceptionPreprocessingTest, TestImagePreprocessUInt8Quantized) {
   ASSERT_TRUE(g_test_image_file != nullptr);
   string image_contents;
   string image_path = *g_test_image_file;
@@ -56,8 +56,8 @@ TEST(InceptionPreprocessingTest, TestImagePreprocessQuantized) {
   ASSERT_TRUE(status.ok()) << status.error_message();
   const int width = 224;
   const int height = 224;
-  const bool is_quantized = true;
-  InceptionPreprocessingStage preprocess_stage(width, height, is_quantized);
+  auto params = InceptionPreprocessingStage::DefaultParamsForType(DT_UINT8);
+  InceptionPreprocessingStage preprocess_stage(width, height, DT_UINT8, params);
   Scope scope = Scope::NewRootScope();
   preprocess_stage.AddToGraph(scope, image_contents);
   TF_CHECK_OK(scope.status());
@@ -77,6 +77,35 @@ TEST(InceptionPreprocessingTest, TestImagePreprocessQuantized) {
   EXPECT_TRUE(outputs[0].shape().IsSameSize({1, 224, 224, 3}));
 }
 
+TEST(InceptionPreprocessingTest, TestImagePreprocessInt8Quantized) {
+  ASSERT_TRUE(g_test_image_file != nullptr);
+  string image_contents;
+  string image_path = *g_test_image_file;
+  auto status = GetContents(image_path, &image_contents);
+  ASSERT_TRUE(status.ok()) << status.error_message();
+  const int width = 224;
+  const int height = 224;
+  auto params = InceptionPreprocessingStage::DefaultParamsForType(DT_INT8);
+  InceptionPreprocessingStage preprocess_stage(width, height, DT_INT8, params);
+  Scope scope = Scope::NewRootScope();
+  preprocess_stage.AddToGraph(scope, image_contents);
+  TF_CHECK_OK(scope.status());
+
+  GraphDef graph_def;
+  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
+  std::unique_ptr<Session> session(NewSession(SessionOptions()));
+  TF_CHECK_OK(session->Create(graph_def));
+  std::vector<Tensor> outputs;
+  auto run_status =
+      session->Run({},                                   /*inputs*/
+                   {preprocess_stage.output_name()}, {}, /*target node names */
+                   &outputs);
+  TF_CHECK_OK(run_status);
+  EXPECT_EQ(1, outputs.size());
+  EXPECT_EQ(DT_INT8, outputs[0].dtype());
+  EXPECT_TRUE(outputs[0].shape().IsSameSize({1, 224, 224, 3}));
+}
+
 TEST(InceptionPreprocessingTest, TestImagePreprocessFloat) {
   ASSERT_TRUE(g_test_image_file != nullptr);
   string image_contents;
@@ -85,8 +114,8 @@ TEST(InceptionPreprocessingTest, TestImagePreprocessFloat) {
   ASSERT_TRUE(status.ok()) << status.error_message();
   const int width = 224;
   const int height = 224;
-  const bool is_quantized = false;
-  InceptionPreprocessingStage preprocess_stage(width, height, is_quantized);
+  auto params = InceptionPreprocessingStage::DefaultParamsForType(DT_FLOAT);
+  InceptionPreprocessingStage preprocess_stage(width, height, DT_FLOAT, params);
   Scope scope = Scope::NewRootScope();
   preprocess_stage.AddToGraph(scope, image_contents);
   TF_CHECK_OK(scope.status());
diff --git a/tensorflow/lite/tools/accuracy/utils.cc b/tensorflow/lite/tools/accuracy/utils.cc
index c19dc1ff7cca10745a367c027bef1067d117eb4a..953892b8ddff2e60d2e1618df97d867b2d553c29 100644
--- a/tensorflow/lite/tools/accuracy/utils.cc
+++ b/tensorflow/lite/tools/accuracy/utils.cc
@@ -38,6 +38,12 @@ DataType GetTFDataType(TfLiteType tflite_type) {
       return DT_FLOAT;
     case kTfLiteUInt8:
       return DT_UINT8;
+    case kTfLiteInt8:
+      return DT_INT8;
+    case kTfLiteInt32:
+      return DT_INT32;
+    case kTfLiteInt64:
+      return DT_INT64;
     default:
       return DT_INVALID;
   }
diff --git a/tensorflow/lite/tools/optimize/calibration_common.h b/tensorflow/lite/tools/optimize/calibration_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ff2d3f18a66ca4323727b8403515e857e54d8cc
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration_common.h
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_COMMON_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_COMMON_H_
+
+#include <unordered_map>
+#include <unordered_set>
+
+#include "tensorflow/lite/mutable_op_resolver.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+using BuiltinOperatorKey = std::pair<BuiltinOperator, int>;
+
+using BuiltinOpsSet = std::unordered_set<
+    BuiltinOperatorKey,
+    op_resolver_hasher::OperatorKeyHasher<BuiltinOperatorKey>>;
+
+template <typename T>
+class BuiltinOpsMap
+    : public std::unordered_map<
+          BuiltinOperatorKey, T,
+          op_resolver_hasher::OperatorKeyHasher<BuiltinOperatorKey>> {};
+
+// An alias for |TfLiteRegistration.invoke|.
+using KernelEvalFuncPtr = TfLiteStatus (*)(TfLiteContext*, TfLiteNode*);
+
+enum class OperatorTensorType { kNone, kInput, kOutput, kIntermediate };
+
+// Information about an operator in the TfLite graph.
+struct OperatorInfo {
+  int node_index;
+  std::string name;
+  BuiltinOperator builtin_op_code;
+  bool is_custom_op;
+  std::vector<int> inputs;
+  std::vector<int> outputs;
+  // Inputs that need to be logged.
+  std::vector<int> loggable_inputs;
+  // Outputs that need to be logged.
+  std::vector<int> loggable_outputs;
+  const TfLiteRegistration* registration;
+};
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_COMMON_H_
diff --git a/tensorflow/lite/tools/optimize/calibration_logger.h b/tensorflow/lite/tools/optimize/calibration_logger.h
new file mode 100644
index 0000000000000000000000000000000000000000..8fd380423a3ee0e671fcedd5c3e2cdf566c993eb
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration_logger.h
@@ -0,0 +1,85 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_LOGGER_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_LOGGER_H_
+
+#include <unordered_map>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+class MinMax {
+ public:
+  void Update(const float* values, size_t tensor_size) {
+    // TODO(shashishekhar): Really slow implementation, optimize
+    if (tensor_size <= 0) return;
+
+    if (!has_values_) {
+      min_ = max_ = values[0];
+      has_values_ = true;
+      return;
+    }
+
+    // We are only logging absolute min/max here.
+    // TODO(shashishekhar): Make it possible to use weighted/moving average.
+    for (size_t i = 0; i < tensor_size; i++) {
+      float val = values[i];
+      if (min_ > val) {
+        min_ = val;
+      } else if (max_ < val) {
+        max_ = val;
+      }
+    }
+  }
+
+  bool HasValues() const { return has_values_; }
+
+  TfLiteStatus Get(float* min_val, float* max_val) const {
+    if (!has_values_) return kTfLiteError;
+    *min_val = min_;
+    *max_val = max_;
+    return kTfLiteOk;
+  }
+
+ private:
+  bool has_values_;
+  float min_, max_;
+};
+
+// Captures min max values for tensors.
+class Logger {
+ public:
+  // Log the value for tensor at |tensor_index| which has |tensor_values|
+  void LogTensorValue(int tensor_index, const float* tensor_values,
+                      size_t tensor_size) {
+    tensor_id_to_stats_map_[tensor_index].Update(tensor_values, tensor_size);
+  }
+
+  // Returns a map from tensor_index -> observed min max values.
+  const std::unordered_map<int, MinMax>& GetCalibrationValues() const {
+    return tensor_id_to_stats_map_;
+  }
+
+ private:
+  std::unordered_map<int, MinMax> tensor_id_to_stats_map_;
+};
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_LOGGER_H_
diff --git a/tensorflow/lite/tools/optimize/calibration_reader.cc b/tensorflow/lite/tools/optimize/calibration_reader.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b01a62bd6c15dee5b60edf5f3abdd40ba4c3a56b
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration_reader.cc
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/calibration_reader.h"
+
+#include "absl/memory/memory.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+TfLiteStatus CalibrationReader::GetTensorStatsAsMap(
+    std::unordered_map<int, CalibrationStats>* tensor_id_to_stats_map) const {
+  tensor_id_to_stats_map->clear();
+  for (const auto& tensorid_stat : logger_->GetCalibrationValues()) {
+    auto minmax = tensorid_stat.second;
+    CalibrationReader::CalibrationStats stats;
+    TF_LITE_ENSURE_STATUS(minmax.Get(&stats.min, &stats.max));
+    tensor_id_to_stats_map->insert({tensorid_stat.first, stats});
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus CalibrationReader::AddCalibrationToModel(ModelT* model) const {
+  if (!model || model->subgraphs.empty()) {
+    return kTfLiteError;
+  }
+  const auto& subgraph = model->subgraphs[0];
+  for (const auto& tensorid_stat : logger_->GetCalibrationValues()) {
+    auto minmax = tensorid_stat.second;
+    float min, max;
+    TF_LITE_ENSURE_STATUS(minmax.Get(&min, &max));
+    auto quant_params = absl::make_unique<tflite::QuantizationParametersT>();
+    quant_params->min.push_back(min);
+    quant_params->max.push_back(max);
+    subgraph->tensors[tensorid_stat.first]->quantization =
+        std::move(quant_params);
+  }
+
+  return kTfLiteOk;
+}
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/calibration_reader.h b/tensorflow/lite/tools/optimize/calibration_reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..af0da1bb3835493e69ef7a6bccb7149ef14b1db9
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration_reader.h
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_READER_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_READER_H_
+
+#include <unordered_map>
+
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/tools/optimize/calibration_logger.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+// Warning: This is not a public API and subject to change.
+//
+// Reads calibrator data collected by running the interpreter through
+// a calibration set.
+class CalibrationReader {
+ public:
+  struct CalibrationStats {
+    float min;
+    float max;
+  };
+  explicit CalibrationReader(const Logger* logger) : logger_(logger) {}
+
+  // Gets a map from tensor index to recorded calibration values.
+  virtual TfLiteStatus GetTensorStatsAsMap(
+      std::unordered_map<int, CalibrationStats>* tensor_id_to_stats_map) const;
+
+  // Annotates the tensors in the given model with statistics captured during
+  // calibration.
+  virtual TfLiteStatus AddCalibrationToModel(ModelT* model) const;
+
+  virtual ~CalibrationReader() {}
+
+ private:
+  const Logger* logger_;
+};
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_READER_H_
diff --git a/tensorflow/lite/tools/optimize/calibrator.cc b/tensorflow/lite/tools/optimize/calibrator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0e817f934618ba7759d23e8a038653834488d2cc
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibrator.cc
@@ -0,0 +1,345 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/calibrator.h"
+
+#include <fstream>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/core/api/op_resolver.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/op_resolver.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/string_util.h"
+#include "tensorflow/lite/tools/optimize/calibration_common.h"
+#include "tensorflow/lite/tools/optimize/calibration_logger.h"
+#include "tensorflow/lite/tools/optimize/calibration_reader.h"
+#include "tensorflow/lite/tools/optimize/logging_op_resolver.h"
+#include "tensorflow/lite/tools/optimize/node_info_delegate.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+
+namespace {
+
+// Calibrator is used to hold information that can be accessed during kernel
+// invocations.
+// TfLite kernel invocations are C functions and cannot look at the global
+// structure of the graph. Calibrator allows the kernel invoke functions to
+// access the global structure of graph and know which node is currently being
+// executed. This also allows us to write a simple kernel invoke wrapper
+// (see LoggingEval) that can work for most builtin ops.
+class Calibrator {
+ public:
+  Calibrator(const std::unordered_map<const TfLiteNode*, OperatorInfo>&
+                 node_ptr_opinfo_map,
+             std::unique_ptr<LoggingOpResolver> logging_op_resolver)
+      : node_ptr_opinfo_map_(node_ptr_opinfo_map),
+        logging_op_resolver_(std::move(logging_op_resolver)) {
+    logger_ = absl::make_unique<Logger>();
+  }
+
+  // Returns the wrapped kernel invoke function |TfLiteRegistration.invoke|.
+  KernelEvalFuncPtr GetKernelInvoke(const TfLiteNode* node) const;
+
+  // Gets the instance of logger associated with the current context.
+  Logger* GetLogger() const { return logger_.get(); }
+
+  // Gets the operator information about the given TfLiteNode.
+  const OperatorInfo& GetOpInfo(const TfLiteNode* node) const {
+    return node_ptr_opinfo_map_.at(node);
+  }
+
+ private:
+  std::unordered_map<const TfLiteNode*, OperatorInfo> node_ptr_opinfo_map_;
+  std::unique_ptr<LoggingOpResolver> logging_op_resolver_;
+  const std::unordered_map<int, OperatorInfo> index_opinfo_;
+  std::unique_ptr<Logger> logger_;
+};
+
+KernelEvalFuncPtr Calibrator::GetKernelInvoke(const TfLiteNode* node) const {
+  auto op_info = node_ptr_opinfo_map_.at(node);
+  return logging_op_resolver_->GetWrappedKernelInvoke(op_info.builtin_op_code,
+                                                      1);
+}
+
+// A registry of |Calibrator| objects per |TfLiteContext|.
+// This global registry is needed to access |Calibrator| objects in the kernel
+// invoke functions i.e. |TfLiteRegistration.invoke|.
+// Kernel invoke functions are C functions that have limited access to
+// |TfLiteContext|. Kernel invoke functions don't have access to global state of
+// graph. That means during a kernel invocation, the function cannot know which
+// node it was invoked for. E.g. in case of a model with |Conv| op at two
+// locations, there is no easy way for the Conv.invoke function to disambiguate
+// the calls.
+//
+// For calibration we solve this problem by creating a map of calibrators
+// per |TfLiteContext|. This map is |GlobalCalibrationRegistry|.
+//
+// This registry is then accessed using a global getter function:
+// |GetCalibratorRegistry|.
+// E.g.
+// TfLiteStatus SomeKernelInvokeFn(TfLiteContext* context, TfLiteNode* node) {
+//   .... code ....
+//   auto registry = GetCalibratorRegistry();
+//   auto calibrator = registry->GetCalibrator(context);
+//   ..... code ....
+//  }
+//
+// This way the kernel invoke functions can get the access to the Calibrator
+// object associated with the |TfLiteContext|.
+class GlobalCalibratorRegistry {
+ public:
+  // Get the |Calibrator| associated with given context, returns null if no
+  // calibrator is associated with the given context.
+  Calibrator* GetCalibrator(const TfLiteContext* context) const {
+    if (calibrator_registry_.find(context) == calibrator_registry_.cend()) {
+      return nullptr;
+    }
+    return calibrator_registry_.at(context).get();
+  }
+
+  // Removes the association between calibrator and context.
+  // Note: This deletes the calibrator as well.
+  void RemoveCalibrator(const TfLiteContext* context) {
+    calibrator_registry_.erase(context);
+  }
+
+  // Creates an instance of |Calibrator|.
+  // Registry owns the |Calibrator| object which can be deleted by calling
+  // |RemoveCalibrator|.
+  TfLiteStatus CreateCalibrator(
+      const TfLiteContext* context,
+      const std::unordered_map<const TfLiteNode*, OperatorInfo>& node_to_opinfo,
+      std::unique_ptr<LoggingOpResolver> logging_op_resolver,
+      Calibrator** calibrator_ptr, ErrorReporter* reporter) {
+    if (calibrator_registry_.find(context) != calibrator_registry_.cend()) {
+      reporter->Report(
+          "Failed to create calibrator, context already registered.");
+      return kTfLiteError;
+    }
+    std::unique_ptr<Calibrator> calibrator = absl::make_unique<Calibrator>(
+        node_to_opinfo, std::move(logging_op_resolver));
+    calibrator_registry_[context] = std::move(calibrator);
+    *calibrator_ptr = calibrator_registry_.at(context).get();
+    return kTfLiteOk;
+  }
+
+ private:
+  std::unordered_map<const TfLiteContext*, std::unique_ptr<Calibrator>>
+      calibrator_registry_;
+};
+
+GlobalCalibratorRegistry* GetCalibratorRegistry() {
+  static GlobalCalibratorRegistry* registry = new GlobalCalibratorRegistry();
+  return registry;
+}
+
+// A wrapper implementation for |TfLiteRegistration.invoke| that logs inputs,
+// invokes the wrapped implementation and then logs the outputs.
+TfLiteStatus LoggingEval(TfLiteContext* context, TfLiteNode* node) {
+  Calibrator* calibrator = GetCalibratorRegistry()->GetCalibrator(context);
+
+  if (!calibrator) {
+    context->ReportError(context, "No calibrator found for context.");
+    return kTfLiteError;
+  }
+
+  auto kernel_invoke = calibrator->GetKernelInvoke(node);
+  auto logger = calibrator->GetLogger();
+  auto op_info = calibrator->GetOpInfo(node);
+
+  for (int i : op_info.loggable_inputs) {
+    auto tensor = context->tensors[i];
+    logger->LogTensorValue(i, tensor.data.f, tensor.bytes / sizeof(float));
+  }
+
+  auto status = kernel_invoke(context, node);
+  // TODO(shashishekhar): An intermediate tensor in graph will get logged twice
+  // once as an input and second time as output. This doesn't change the min max
+  // values but is inefficient.
+  // Using moving average will also break this.
+
+  for (int i : op_info.loggable_outputs) {
+    auto tensor = context->tensors[i];
+    logger->LogTensorValue(i, tensor.data.f, tensor.bytes / sizeof(float));
+  }
+
+  return status;
+}
+
+// Returns the loggable tensors. Not all inputs and outputs need to be logged.
+// For example, const weight tensors which have buffers associated with them
+// don't need to be logged.
+std::vector<int> GetLoggableTensorIndices(
+    const std::vector<int>& tensor_indices,
+    const flatbuffers::Vector<flatbuffers::Offset<Tensor>>* tensors,
+    const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* tensor_buffers) {
+  std::vector<int> loggable;
+  for (auto tensor_index : tensor_indices) {
+    auto tensor = tensors->Get(tensor_index);
+    auto buffer_index = tensor->buffer();
+    bool has_no_buffer =
+        buffer_index == 0 || (tensor_buffers->Get(buffer_index) == nullptr);
+    if (has_no_buffer && tensor->type() == tflite::TensorType_FLOAT32) {
+      loggable.push_back(tensor_index);
+    }
+  }
+  return loggable;
+}
+
+// Creates a mapping between the static model graph and the runtime TfLiteNode*
+// nodes in the graph for the given context.
+// This is done by querying the TfLiteContext for node and registrations using
+// the |NodeInfoDelegateObserver|.
+TfLiteStatus GetNodeOpInfoMapAndContext(
+    const std::unordered_map<int, OperatorInfo>& node_to_opinfo,
+    tflite::Interpreter* const interpreter,
+    std::unordered_map<const TfLiteNode*, OperatorInfo>* node_ptr_opinfo_map,
+    const TfLiteContext** context
+
+) {
+  NodeInfoDelegateObserver delegate_observer(node_to_opinfo,
+                                             node_ptr_opinfo_map);
+  NodeInfoDelegateParams delegate_params;
+  delegate_params.delegate_observer = &delegate_observer;
+  TfLiteDelegate logging_delegate = CreateNodeInfoDelegate(&delegate_params);
+
+  auto modify_status = interpreter->ModifyGraphWithDelegate(&logging_delegate);
+  if (modify_status != kTfLiteOk) {
+    return kTfLiteError;
+  }
+  *context = delegate_observer.GetContext();
+  return kTfLiteOk;
+}
+
+string GetOpName(const tflite::OperatorCode& opcode) {
+  if (opcode.custom_code() != nullptr) {
+    return opcode.custom_code()->str();
+  }
+  return tflite::EnumNamesBuiltinOperator()[opcode.builtin_code()];
+}
+
+// A |CalibrationReader| that owns the Calibrator.
+class Reader : public CalibrationReader {
+ public:
+  Reader(const TfLiteContext* context, const Logger* logger)
+      : CalibrationReader(logger), context_(context) {}
+
+  ~Reader() override { GetCalibratorRegistry()->RemoveCalibrator(context_); }
+
+ private:
+  const TfLiteContext* context_;
+};
+
+}  // namespace
+
+TfLiteStatus BuildLoggingInterpreter(
+    const FlatBufferModel& model, const OpResolver& op_resolver,
+    std::unique_ptr<Interpreter>* interpreter,
+    std::unique_ptr<CalibrationReader>* calibration_reader) {
+  auto tflite_model = model.GetModel();
+  auto subgraphs = tflite_model->subgraphs();
+  auto tensor_buffers = tflite_model->buffers();
+
+  if (subgraphs->size() != 1) {
+    model.error_reporter()->Report(
+        "Only models with a single subgraph are supported, model had %d "
+        "subgraphs",
+        subgraphs->size());
+    return kTfLiteError;
+  }
+
+  // Populate the node index to operator info map.
+  // We want to collect this information so we can use it during runtime to
+  // log details of which inputs and outputs.
+  // At runtime TFLite kernel invoke functions can only look into their
+  // own node in the graph (TFLiteNode*) and some limited context information.
+  auto primary_subgraph = subgraphs->Get(0);
+  auto operator_codes = tflite_model->operator_codes();
+  auto operators = primary_subgraph->operators();
+  auto tensors = primary_subgraph->tensors();
+  std::unordered_map<int, OperatorInfo> node_to_opinfo;
+  BuiltinOpsSet op_and_versions;
+
+  for (size_t i = 0; i < operators->size(); i++) {
+    OperatorInfo op_info;
+    op_info.node_index = i;
+    auto op = operators->Get(i);
+    auto operator_code = operator_codes->Get(op->opcode_index());
+    op_info.builtin_op_code = operator_code->builtin_code();
+    op_info.name = GetOpName(*operator_code);
+    op_info.is_custom_op = operator_code->custom_code() != nullptr;
+
+    auto op_inputs = op->inputs();
+    auto op_outputs = op->outputs();
+    op_info.inputs = std::vector<int>(op_inputs->begin(), op_inputs->end());
+    op_info.outputs = std::vector<int>(op_outputs->begin(), op_outputs->end());
+    op_info.loggable_inputs =
+        GetLoggableTensorIndices(op_info.inputs, tensors, tensor_buffers);
+    op_info.loggable_outputs =
+        GetLoggableTensorIndices(op_info.outputs, tensors, tensor_buffers);
+    if (!op_info.is_custom_op) {
+      op_info.registration = op_resolver.FindOp(operator_code->builtin_code(),
+                                                operator_code->version());
+    } else {
+      op_info.registration =
+          op_resolver.FindOp(op_info.name.c_str(), operator_code->version());
+    }
+    node_to_opinfo[i] = op_info;
+    op_and_versions.insert({op_info.builtin_op_code, operator_code->version()});
+  }
+
+  // Prepare the logging op resolver to use |LoggingEval| for kernel
+  // invocations.
+  auto logging_op_resolver = absl::make_unique<LoggingOpResolver>(
+      op_and_versions, op_resolver, LoggingEval);
+  tflite::InterpreterBuilder(model, *logging_op_resolver)(interpreter);
+
+  if (!(*interpreter)) {
+    model.error_reporter()->Report("Failed to construct interpreter");
+    return kTfLiteError;
+  }
+
+  // Compute the mapping between runtime and static graph structure, i.e.
+  // (TfLiteContext, TfLiteNode) -> OperatorInfo
+  std::unordered_map<const TfLiteNode*, OperatorInfo> node_ptr_opinfo_map;
+  const TfLiteContext* context = nullptr;
+  GetNodeOpInfoMapAndContext(node_to_opinfo, interpreter->get(),
+                             &node_ptr_opinfo_map, &context);
+
+  Calibrator* calibrator = nullptr;
+  // Register a calibrator object for the context. This can be accessed
+  // during invocations by the logging kernels.
+  TF_LITE_ENSURE_STATUS(GetCalibratorRegistry()->CreateCalibrator(
+      context, node_ptr_opinfo_map, std::move(logging_op_resolver), &calibrator,
+      model.error_reporter()));
+  *calibration_reader = std::unique_ptr<CalibrationReader>(
+      new Reader(context, calibrator->GetLogger()));
+
+  return kTfLiteOk;
+}
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/calibrator.h b/tensorflow/lite/tools/optimize/calibrator.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab3cb27eb7518b7327655023739e310e2a6b0249
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibrator.h
@@ -0,0 +1,64 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATOR_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATOR_H_
+
+#include <unordered_map>
+
+#include "flatbuffers/flatbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/core/api/op_resolver.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/tools/optimize/calibration_reader.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+
+// Warning: This is not a public API and subject to change.
+
+// Builds a interpreter that logs the calibration data in memory.
+// The calibration data can be recovered using |calibration_reader|.
+//
+// Sample usage:
+// std::unique_ptr<Interpreter> interpreter;
+// std::unique_ptr<CalibrationReader> calibration_reader;
+// BuiltinOpResolver resolver = ...
+// FlatBufferModel model = ..
+//
+// BuildLoggingInterpreter(model, resolver, &interpreter,
+//  &calibration_reader);
+//
+//
+// * Allocate tensors...
+// * Call interpreter->invoke on calibration dataset.
+//
+// Calibration data can be read either directly by calling
+// std::unordered_map<int,  CalibrationStats>> tensor_index_to_stats;
+// calibration_reader->GetTensorStatsAsMap(&tensor_index_to_stats);
+//
+// or adding calibration data to model itself.
+// ModelT * original_floating_point_model = ...
+// calibration_reader->AddCalibrationToModel(original_floating_point_model);
+//
+TfLiteStatus BuildLoggingInterpreter(
+    const FlatBufferModel& model, const OpResolver& op_resolver,
+    std::unique_ptr<Interpreter>* interpreter,
+    std::unique_ptr<CalibrationReader>* calibration_reader);
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATOR_H_
diff --git a/tensorflow/lite/tools/optimize/calibrator_test.cc b/tensorflow/lite/tools/optimize/calibrator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bbbcc70fae1a775cf49bedd809799d3472e3d060
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibrator_test.cc
@@ -0,0 +1,189 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstring>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/tools/optimize/calibrator.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+namespace {
+
+TEST(CalibratorTest, CalibrationStatsAreCollected) {
+  auto model = FlatBufferModel::BuildFromFile(
+      "third_party/tensorflow/lite/testdata/multi_add.bin");
+  ASSERT_TRUE(model);
+  std::unique_ptr<Interpreter> interpreter;
+  std::unique_ptr<CalibrationReader> reader;
+  auto status = BuildLoggingInterpreter(
+      *model, ops::builtin::BuiltinOpResolver{}, &interpreter, &reader);
+  EXPECT_EQ(kTfLiteOk, status);
+
+  ASSERT_TRUE(interpreter);
+  ASSERT_TRUE(reader);
+  std::unordered_map<int, CalibrationReader::CalibrationStats> stats;
+  status = reader->GetTensorStatsAsMap(&stats);
+  EXPECT_EQ(kTfLiteOk, status);
+  EXPECT_TRUE(stats.empty());
+
+  status = interpreter->AllocateTensors();
+  ASSERT_EQ(kTfLiteOk, status);
+  // Model does the following:
+  // 0        1       2        3
+  // |        |__ ____|        |
+  // |           |             |
+  // |          Add(tensor:4)  |
+  // |____ ______|______ ______|
+  //      |             |
+  //      Add          Add
+  //      |             |
+  //    Output:5      Output:6
+
+  const size_t tensor_size = 1 * 8 * 8 * 3;
+
+  std::vector<float> ones(tensor_size, 1.0f);
+  // Fill input tensor i with i+1, i.e. input[0] = 1.0f, input[1] = 2.0f,
+  // input[2] = 3.0f
+
+  for (size_t i = 0; i < interpreter->inputs().size(); i++) {
+    int input_tensor_idx = interpreter->inputs()[i];
+    TfLiteTensor* tensor = interpreter->tensor(input_tensor_idx);
+    ASSERT_EQ(tensor->bytes, tensor_size * sizeof(float));
+    for (size_t j = 0; j < tensor_size; j++) {
+      tensor->data.f[j] = i + 1;
+    }
+  }
+  status = interpreter->Invoke();
+  ASSERT_EQ(kTfLiteOk, status);
+  const float eps = 1e-6f;
+  // Verify that tensor 5: is 6
+  // Verify that tensor 6: is 9
+  TfLiteTensor* tensor = interpreter->tensor(interpreter->outputs()[0]);
+  for (size_t i = 0; i < tensor_size; i++) {
+    EXPECT_NEAR(tensor->data.f[i], 6.0f, eps);
+  }
+  tensor = interpreter->tensor(interpreter->outputs()[1]);
+  for (size_t i = 0; i < tensor_size; i++) {
+    EXPECT_NEAR(tensor->data.f[i], 9.0f, eps);
+  }
+
+  // Verify that min max of tensors.
+  status = reader->GetTensorStatsAsMap(&stats);
+  EXPECT_EQ(kTfLiteOk, status);
+  EXPECT_EQ(7, stats.size());
+  // Check inputs
+  for (int tensor_idx = 0; tensor_idx < 4; tensor_idx++) {
+    EXPECT_NEAR(stats.at(tensor_idx).min, tensor_idx + 1, eps);
+    EXPECT_NEAR(stats.at(tensor_idx).max, tensor_idx + 1, eps);
+  }
+  // Check tensor 4 max.
+  EXPECT_NEAR(stats.at(4).min, 5, eps);
+  EXPECT_NEAR(stats.at(4).max, 5, eps);
+
+  // Check outputs
+  EXPECT_NEAR(stats.at(5).min, 6, eps);
+  EXPECT_NEAR(stats.at(5).max, 6, eps);
+
+  EXPECT_NEAR(stats.at(6).min, 9, eps);
+  EXPECT_NEAR(stats.at(6).max, 9, eps);
+}
+
+TEST(CalibratorTest, MultipleInvokes) {
+  auto model = FlatBufferModel::BuildFromFile(
+      "third_party/tensorflow/lite/testdata/multi_add.bin");
+  ASSERT_TRUE(model);
+  std::unique_ptr<Interpreter> interpreter;
+  std::unique_ptr<CalibrationReader> reader;
+  auto status = BuildLoggingInterpreter(
+      *model, ops::builtin::BuiltinOpResolver{}, &interpreter, &reader);
+  EXPECT_EQ(kTfLiteOk, status);
+
+  ASSERT_TRUE(interpreter);
+  ASSERT_TRUE(reader);
+  status = interpreter->AllocateTensors();
+
+  EXPECT_EQ(kTfLiteOk, status);
+  const size_t tensor_size = 1 * 8 * 8 * 3;
+  // Fill input tensor i with i+1, i.e. input[0] = 1.0f, input[1] = 2.0f,
+  // input[2] = 3.0f
+
+  for (size_t i = 0; i < interpreter->inputs().size(); i++) {
+    int input_tensor_idx = interpreter->inputs()[i];
+    TfLiteTensor* tensor = interpreter->tensor(input_tensor_idx);
+    ASSERT_EQ(tensor->bytes, tensor_size * sizeof(float));
+    for (size_t j = 0; j < tensor_size; j++) {
+      tensor->data.f[j] = i + 1;
+    }
+  }
+  status = interpreter->Invoke();
+  ASSERT_EQ(kTfLiteOk, status);
+  const float eps = 1e-6f;
+  // Verify that min max of tensors.
+  std::unordered_map<int, CalibrationReader::CalibrationStats> stats;
+  status = reader->GetTensorStatsAsMap(&stats);
+  EXPECT_EQ(kTfLiteOk, status);
+  EXPECT_EQ(7, stats.size());
+  const float expected_values[7] = {
+      1.0f,  // input 0
+      2.0f,  // input 1
+      3.0f,  // input 2
+      4.0f,  // input 3
+      5.0f,  // Add(1, 2)
+      6.0f,  // Output 5: Add(0, Add(1,2))
+      9.0f,  // Output 6: Add(Add(1,2), 3)
+  };
+  for (int tensor_idx = 0; tensor_idx < 7; tensor_idx++) {
+    EXPECT_NEAR(stats.at(tensor_idx).min, expected_values[tensor_idx], eps);
+    EXPECT_NEAR(stats.at(tensor_idx).max, expected_values[tensor_idx], eps);
+  }
+  // Set input[0][0] = 1.5 and input[0][1] = 0.5 this should change the values
+  // only for input[0] and tensor 4 and ouputs 5, 6.
+  TfLiteTensor* input0 = interpreter->tensor(0);
+  input0->data.f[0] = 1.5f;
+  input0->data.f[1] = 0.5f;
+  status = interpreter->Invoke();
+  ASSERT_EQ(kTfLiteOk, status);
+  status = reader->GetTensorStatsAsMap(&stats);
+  EXPECT_EQ(kTfLiteOk, status);
+  EXPECT_EQ(7, stats.size());
+  EXPECT_NEAR(stats.at(0).min, 0.5f, eps);
+  EXPECT_NEAR(stats.at(0).max, 1.5f, eps);
+
+  for (int tensor_idx = 1; tensor_idx < 5; tensor_idx++) {
+    EXPECT_NEAR(stats.at(tensor_idx).min, expected_values[tensor_idx], eps);
+    EXPECT_NEAR(stats.at(tensor_idx).max, expected_values[tensor_idx], eps);
+  }
+
+  EXPECT_NEAR(stats.at(5).min, 5.5f, eps);
+  EXPECT_NEAR(stats.at(5).max, 6.5f, eps);
+
+  EXPECT_NEAR(stats.at(6).min, 9.0f, eps);
+  EXPECT_NEAR(stats.at(6).max, 9.0f, eps);
+}
+
+}  // namespace
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  // On Linux, add: FLAGS_logtostderr = true;
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/tools/optimize/logging_op_resolver.cc b/tensorflow/lite/tools/optimize/logging_op_resolver.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7633ebb8dd9d7aee0b8a5befa5d51911f68a7e32
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/logging_op_resolver.cc
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/logging_op_resolver.h"
+
+#include "absl/memory/memory.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+
+LoggingOpResolver::LoggingOpResolver(const BuiltinOpsSet& ops_to_replace,
+                                     const OpResolver& base_resolver,
+                                     KernelEvalFuncPtr logging_eval_fn) {
+  for (const auto& op_and_version : ops_to_replace) {
+    const TfLiteRegistration* base_registration =
+        base_resolver.FindOp(op_and_version.first, op_and_version.second);
+    BuiltinOperatorKey key = op_and_version;
+    builtin_op_evalfn_map_[key] = base_registration->invoke;
+    std::unique_ptr<TfLiteRegistration> logging_registation =
+        absl::make_unique<TfLiteRegistration>(*base_registration);
+    logging_registation->invoke = logging_eval_fn;
+    builtin_op_registration_map_[key] = std::move(logging_registation);
+  }
+}
+
+const TfLiteRegistration* LoggingOpResolver::FindOp(BuiltinOperator op,
+                                                    int version) const {
+  BuiltinOperatorKey key = {op, version};
+  if (builtin_op_registration_map_.find(key) !=
+      builtin_op_registration_map_.end()) {
+    return builtin_op_registration_map_.at(key).get();
+  }
+
+  return nullptr;
+}
+
+KernelEvalFuncPtr LoggingOpResolver::GetWrappedKernelInvoke(BuiltinOperator op,
+                                                            int version) const {
+  return builtin_op_evalfn_map_.at({op, version});
+}
+
+const TfLiteRegistration* LoggingOpResolver::FindOp(const char* op,
+                                                    int version) const {
+  // TODO(b/121374947): Support custom ops as well.
+  return nullptr;
+}
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/logging_op_resolver.h b/tensorflow/lite/tools/optimize/logging_op_resolver.h
new file mode 100644
index 0000000000000000000000000000000000000000..58a3a0fe3c08288ccba6881a64b1fd581103da10
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/logging_op_resolver.h
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_LOGGING_OP_RESOLVER_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_LOGGING_OP_RESOLVER_H_
+
+#include <set>
+#include <unordered_map>
+
+#include "tensorflow/lite/core/api/op_resolver.h"
+#include "tensorflow/lite/mutable_op_resolver.h"
+#include "tensorflow/lite/op_resolver.h"
+#include "tensorflow/lite/tools/optimize/calibration_common.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+// A resolver that replaces the kernel invocations with a wrapper
+// eval function.
+class LoggingOpResolver : public OpResolver {
+ public:
+  // Creates an instance of |LoggingOpResolver|.
+  // All |TfLiteRegistration.invoke| functions are replaced by
+  // |logging_eval_fn|.
+  // TODO(shashishekhar): This interface needs to change for custom ops and
+  // BuiltinOps that need special logging implementations.
+  LoggingOpResolver(const BuiltinOpsSet& ops_to_replace,
+                    const OpResolver& base_resolver,
+                    KernelEvalFuncPtr logging_eval_fn);
+
+  const TfLiteRegistration* FindOp(BuiltinOperator op,
+                                   int version) const override;
+
+  KernelEvalFuncPtr GetWrappedKernelInvoke(BuiltinOperator op,
+                                           int version) const;
+  const TfLiteRegistration* FindOp(const char* op, int version) const override;
+
+ private:
+  BuiltinOpsMap<std::unique_ptr<TfLiteRegistration>>
+      builtin_op_registration_map_;
+  BuiltinOpsMap<KernelEvalFuncPtr> builtin_op_evalfn_map_;
+};
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_LOGGING_OP_RESOLVER_H_
diff --git a/tensorflow/lite/tools/optimize/logging_op_resolver_test.cc b/tensorflow/lite/tools/optimize/logging_op_resolver_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..18c29abec65de748184cc24c31d5ddd81ce21b0f
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/logging_op_resolver_test.cc
@@ -0,0 +1,141 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/logging_op_resolver.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/mutable_op_resolver.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+namespace {
+
+TfLiteStatus ConvPrepare(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+TfLiteStatus ConvEval(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+TfLiteStatus AddPrepare(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+TfLiteStatus AddEval(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+TfLiteStatus WrappingInvoke(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+TEST(LoggingOpResolverTest, KernelInvokesAreReplaced) {
+  MutableOpResolver base_resolver;
+  TfLiteRegistration conv_registration = {
+      .prepare = ConvPrepare,
+      .invoke = ConvEval,
+  };
+  base_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &conv_registration);
+
+  TfLiteRegistration add_registration = {
+      .prepare = AddPrepare,
+      .invoke = AddEval,
+  };
+  base_resolver.AddBuiltin(BuiltinOperator_ADD, &add_registration);
+  BuiltinOpsSet ops_to_replace = {
+      {BuiltinOperator_CONV_2D, /*version*/ 1},
+      {BuiltinOperator_ADD, /*version*/ 1},
+  };
+
+  LoggingOpResolver resolver(ops_to_replace, base_resolver, WrappingInvoke);
+
+  auto reg = resolver.FindOp(BuiltinOperator_CONV_2D, 1);
+
+  EXPECT_EQ(reg->builtin_code, BuiltinOperator_CONV_2D);
+  EXPECT_TRUE(reg->prepare == ConvPrepare);
+  EXPECT_TRUE(reg->invoke == WrappingInvoke);
+
+  reg = resolver.FindOp(BuiltinOperator_ADD, 1);
+
+  EXPECT_EQ(reg->builtin_code, BuiltinOperator_ADD);
+  EXPECT_TRUE(reg->prepare == AddPrepare);
+  EXPECT_TRUE(reg->invoke == WrappingInvoke);
+}
+
+TEST(LoggingOpResolverTest, OriginalKernelInvokesAreRetained) {
+  MutableOpResolver base_resolver;
+  TfLiteRegistration conv_registration = {
+      .prepare = ConvPrepare,
+      .invoke = ConvEval,
+  };
+  base_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &conv_registration);
+
+  TfLiteRegistration add_registration = {
+      .prepare = AddPrepare,
+      .invoke = AddEval,
+  };
+  base_resolver.AddBuiltin(BuiltinOperator_ADD, &add_registration);
+  BuiltinOpsSet ops_to_replace = {
+      {BuiltinOperator_CONV_2D, /*version*/ 1},
+      {BuiltinOperator_ADD, /*version*/ 1},
+  };
+
+  LoggingOpResolver resolver(ops_to_replace, base_resolver, WrappingInvoke);
+  auto kernel_invoke =
+      resolver.GetWrappedKernelInvoke(BuiltinOperator_CONV_2D, 1);
+  EXPECT_TRUE(kernel_invoke == ConvEval);
+  kernel_invoke = resolver.GetWrappedKernelInvoke(BuiltinOperator_ADD, 1);
+  EXPECT_TRUE(kernel_invoke == AddEval);
+}
+
+TEST(LoggingOpResolverTest, OnlyOpsInReplacementSetAreReplaces) {
+  MutableOpResolver base_resolver;
+  TfLiteRegistration conv_registration = {
+      .prepare = ConvPrepare,
+      .invoke = ConvEval,
+  };
+  base_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &conv_registration);
+
+  TfLiteRegistration add_registration = {
+      .prepare = AddPrepare,
+      .invoke = AddEval,
+  };
+  base_resolver.AddBuiltin(BuiltinOperator_ADD, &add_registration);
+  // Only replace conv2d
+  BuiltinOpsSet ops_to_replace = {
+      {BuiltinOperator_CONV_2D, /*version*/ 1},
+  };
+
+  LoggingOpResolver resolver(ops_to_replace, base_resolver, WrappingInvoke);
+  auto reg = resolver.FindOp(BuiltinOperator_CONV_2D, 1);
+  EXPECT_EQ(reg->builtin_code, BuiltinOperator_CONV_2D);
+  EXPECT_TRUE(reg->prepare == ConvPrepare);
+  EXPECT_TRUE(reg->invoke == WrappingInvoke);
+
+  reg = resolver.FindOp(BuiltinOperator_ADD, 1);
+  EXPECT_EQ(nullptr, reg);
+}
+
+}  // namespace
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  // On Linux, add: FLAGS_logtostderr = true;
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/tools/optimize/node_info_delegate.cc b/tensorflow/lite/tools/optimize/node_info_delegate.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ccaa69373fcf55adaef21a948089ea59821ca763
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/node_info_delegate.cc
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/node_info_delegate.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+
+namespace {
+// The prepare function for delegate that forwards the prepare call to the
+// delegate observer in node info delegate params.
+// The function simply calls a delegate observer OnDelegatePrepareMethod.
+TfLiteStatus NodeInfoDelegatePrepare(TfLiteContext* context,
+                                     TfLiteDelegate* delegate) {
+  if (delegate == nullptr) return TfLiteStatus::kTfLiteError;
+
+  NodeInfoDelegateParams* params =
+      reinterpret_cast<NodeInfoDelegateParams*>(delegate->data_);
+  return params->delegate_observer->OnDelegatePrepareCalled(context);
+}
+}  // namespace
+
+TfLiteDelegate CreateNodeInfoDelegate(NodeInfoDelegateParams* params) {
+  return {.data_ = params,
+          .Prepare = NodeInfoDelegatePrepare,
+          .CopyFromBufferHandle = nullptr,
+          .CopyToBufferHandle = nullptr,
+          .FreeBufferHandle = nullptr};
+}
+
+TfLiteStatus NodeInfoDelegateObserver::OnDelegatePrepareCalled(
+    TfLiteContext* context) {
+  context_ = context;
+  const size_t num_nodes = node_index_opinfo_map_.size();
+  for (size_t node_index = 0; node_index < num_nodes; node_index++) {
+    TfLiteNode* node = nullptr;
+    TfLiteRegistration* reg = nullptr;
+    TF_LITE_ENSURE_STATUS(
+        context->GetNodeAndRegistration(context, node_index, &node, &reg));
+    auto op_info = node_index_opinfo_map_.at(node_index);
+    op_info.registration = reg;
+    node_ptr_opinfo_map_->insert({node, op_info});
+  }
+
+  if (node_ptr_opinfo_map_->size() != node_index_opinfo_map_.size()) {
+    // Something wrong.
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/node_info_delegate.h b/tensorflow/lite/tools/optimize/node_info_delegate.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ee2ce1978cf87b104518c4b64e84df166cef32d
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/node_info_delegate.h
@@ -0,0 +1,67 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_NODE_INFO_DELEGATE_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_NODE_INFO_DELEGATE_H_
+
+#include <unordered_map>
+
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/tools/optimize/calibration_common.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+
+// An interface for delegate observer that can listen to TfLiteDelegate::Prepare
+// calls.
+class DelegateObserver {
+ public:
+  virtual TfLiteStatus OnDelegatePrepareCalled(TfLiteContext* context) = 0;
+  virtual ~DelegateObserver() {}
+};
+
+// The parameters for the node info delegate.
+struct NodeInfoDelegateParams {
+  DelegateObserver* delegate_observer;
+};
+
+// Creates a delegate with the given |params|.
+TfLiteDelegate CreateNodeInfoDelegate(NodeInfoDelegateParams* params);
+
+// A delegate observer that can construct the map from TfLiteNode* ->
+// OperatorInfo.
+class NodeInfoDelegateObserver : public DelegateObserver {
+ public:
+  NodeInfoDelegateObserver(
+      const std::unordered_map<int, OperatorInfo>& node_index_to_op,
+      std::unordered_map<const TfLiteNode*, OperatorInfo>* node_ptr_opinfo_map)
+      : node_index_opinfo_map_(node_index_to_op),
+        node_ptr_opinfo_map_(node_ptr_opinfo_map) {}
+
+  TfLiteStatus OnDelegatePrepareCalled(TfLiteContext* context) override;
+
+  // Returns the context that was used to called the prepare method.
+  const TfLiteContext* GetContext() const { return context_; }
+
+ private:
+  const TfLiteContext* context_ = nullptr;
+  const std::unordered_map<int, OperatorInfo>& node_index_opinfo_map_;
+  std::unordered_map<const TfLiteNode*, OperatorInfo>* node_ptr_opinfo_map_;
+};
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_NODE_INFO_DELEGATE_H_
diff --git a/tensorflow/lite/tools/optimize/node_info_delegate_test.cc b/tensorflow/lite/tools/optimize/node_info_delegate_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e762d5c0144fe7f37782dca2dc4bca57b1553450
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/node_info_delegate_test.cc
@@ -0,0 +1,152 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unordered_map>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/tools/optimize/node_info_delegate.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+namespace {
+
+class TestDelegateObserver : public DelegateObserver {
+ public:
+  explicit TestDelegateObserver(TfLiteStatus status_to_return)
+      : status_to_return_(status_to_return) {}
+
+  TfLiteStatus OnDelegatePrepareCalled(TfLiteContext* context) override {
+    num_times_called_++;
+    return status_to_return_;
+  }
+  int num_times_called() { return num_times_called_; }
+
+ private:
+  int num_times_called_ = 0;
+  TfLiteStatus status_to_return_;
+};
+
+TEST(NodeInfoDelegateTest, DelegateObserverIsCalled) {
+  TestDelegateObserver observer(kTfLiteOk);
+  NodeInfoDelegateParams params;
+  params.delegate_observer = &observer;
+  auto model = FlatBufferModel::BuildFromFile(
+      "third_party/tensorflow/lite/testdata/multi_add.bin");
+  ASSERT_TRUE(model);
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_EQ(InterpreterBuilder(*model,
+                               ops::builtin::BuiltinOpResolver{})(&interpreter),
+            kTfLiteOk);
+  ASSERT_TRUE(interpreter);
+  EXPECT_EQ(0, observer.num_times_called());
+  TfLiteDelegate delegate = CreateNodeInfoDelegate(&params);
+
+  auto status = interpreter->ModifyGraphWithDelegate(&delegate);
+  EXPECT_EQ(kTfLiteOk, status);
+  EXPECT_EQ(1, observer.num_times_called());
+}
+
+TEST(NodeInfoDelegateTest, ObserverErrorCausesModifyGraphFailure) {
+  // Observer returns error
+  TestDelegateObserver observer(kTfLiteError);
+  NodeInfoDelegateParams params;
+  params.delegate_observer = &observer;
+  auto model = FlatBufferModel::BuildFromFile(
+      "third_party/tensorflow/lite/testdata/multi_add.bin");
+  ASSERT_TRUE(model);
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_EQ(InterpreterBuilder(*model,
+                               ops::builtin::BuiltinOpResolver{})(&interpreter),
+            kTfLiteOk);
+  ASSERT_TRUE(interpreter);
+  TfLiteDelegate delegate = CreateNodeInfoDelegate(&params);
+
+  auto status = interpreter->ModifyGraphWithDelegate(&delegate);
+  EXPECT_EQ(kTfLiteError, status);
+}
+
+TEST(NodeInfoDelegateTest, NodeInfoDelegateObserver) {
+  auto model = FlatBufferModel::BuildFromFile(
+      "third_party/tensorflow/lite/testdata/multi_add.bin");
+  ASSERT_TRUE(model);
+
+  std::unordered_map<int, OperatorInfo> index_to_opinfo;
+  auto primary_subgraph = model->GetModel()->subgraphs()->Get(0);
+  auto operators = primary_subgraph->operators();
+  auto subgraph_tensors = primary_subgraph->tensors();
+  for (size_t i = 0; i < operators->size(); i++) {
+    OperatorInfo info;
+    auto op_inputs = operators->Get(i)->inputs();
+    auto op_outputs = operators->Get(i)->outputs();
+    info.inputs = std::vector<int>(op_inputs->begin(), op_inputs->end());
+    info.outputs = std::vector<int>(op_outputs->begin(), op_outputs->end());
+    index_to_opinfo[i] = info;
+  }
+
+  std::unordered_map<const TfLiteNode*, OperatorInfo> node_to_opinfo;
+  NodeInfoDelegateObserver observer(index_to_opinfo, &node_to_opinfo);
+  NodeInfoDelegateParams params;
+  params.delegate_observer = &observer;
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_EQ(InterpreterBuilder(*model,
+                               ops::builtin::BuiltinOpResolver{})(&interpreter),
+            kTfLiteOk);
+  ASSERT_TRUE(interpreter);
+
+  TfLiteDelegate delegate = CreateNodeInfoDelegate(&params);
+
+  auto status = interpreter->ModifyGraphWithDelegate(&delegate);
+  EXPECT_EQ(kTfLiteOk, status);
+  EXPECT_EQ(index_to_opinfo.size(), node_to_opinfo.size());
+  EXPECT_EQ(interpreter->nodes_size(), node_to_opinfo.size());
+
+  for (const auto& node_and_opinfo : node_to_opinfo) {
+    const TfLiteNode* tflite_node = node_and_opinfo.first;
+    const OperatorInfo& info = node_and_opinfo.second;
+    ASSERT_EQ(tflite_node->inputs->size, info.inputs.size());
+    ASSERT_EQ(tflite_node->outputs->size, info.outputs.size());
+
+    for (size_t input_index = 0; input_index < info.inputs.size();
+         input_index++) {
+      const TfLiteTensor* tflite_tensor =
+          interpreter->tensor(tflite_node->inputs->data[input_index]);
+      EXPECT_EQ(tflite_tensor->name,
+                subgraph_tensors->Get(info.inputs[input_index])->name()->str());
+    }
+
+    for (size_t output_index = 0; output_index < info.outputs.size();
+         output_index++) {
+      const TfLiteTensor* tflite_tensor =
+          interpreter->tensor(tflite_node->outputs->data[output_index]);
+      EXPECT_EQ(
+          tflite_tensor->name,
+          subgraph_tensors->Get(info.outputs[output_index])->name()->str());
+    }
+  }
+}
+
+}  // namespace
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  // On Linux, add: FLAGS_logtostderr = true;
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/tools/pip_package/setup.py b/tensorflow/lite/tools/pip_package/setup.py
index 64d62ee1f2d5d0cc1fa1d1804c637f8220937128..82f9ac5b8facd0d37dd37080c252c0f22710c9c2 100644
--- a/tensorflow/lite/tools/pip_package/setup.py
+++ b/tensorflow/lite/tools/pip_package/setup.py
@@ -136,7 +136,7 @@ setup(
     long_description='\n'.join(DOCLINES[2:]),
     url='https://www.tensorflow.org/lite/',
     author='Google Inc.',
-    author_email='opensource@google.com',
+    author_email='packages@tensorflow.org',
     license='Apache 2.0',
     include_package_data=True,
     keywords='tflite tensorflow tensor machine learning',
diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files
index 88800c295124cbb7e1f292c6970b81e3b0594ab3..bade64dcf822304191d666c516d130bf1ed0f8bc 100644
--- a/tensorflow/opensource_only.files
+++ b/tensorflow/opensource_only.files
@@ -53,6 +53,9 @@ tensorflow/third_party/toolchains/preconfig/generate/containers.bzl
 tensorflow/third_party/toolchains/preconfig/generate/generate.bzl
 tensorflow/third_party/toolchains/preconfig/generate/archives.bzl
 tensorflow/third_party/toolchains/preconfig/generate/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang/dummy_toolchain.bzl
+tensorflow/third_party/toolchains/preconfig/ubuntu16.04/py3/BUILD
 tensorflow/third_party/toolchains/preconfig/win_1803/bazel_018/BUILD
 tensorflow/third_party/toolchains/preconfig/win_1803/bazel_018/dummy_toolchain.bzl
 tensorflow/third_party/toolchains/preconfig/win_1803/py36/BUILD
@@ -69,13 +72,6 @@ tensorflow/third_party/toolchains/cpus/arm/BUILD
 tensorflow/third_party/toolchains/cpus/py3/BUILD
 tensorflow/third_party/toolchains/cpus/py/BUILD
 tensorflow/third_party/toolchains/BUILD
-tensorflow/third_party/nccl/remote.BUILD.tpl
-tensorflow/third_party/nccl/archive.BUILD
-tensorflow/third_party/nccl/LICENSE
-tensorflow/third_party/nccl/system.BUILD.tpl
-tensorflow/third_party/nccl/nccl_configure.bzl
-tensorflow/third_party/nccl/build_defs.bzl.tpl
-tensorflow/third_party/nccl/BUILD
 tensorflow/third_party/gpus/BUILD
 tensorflow/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
 tensorflow/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl
@@ -175,6 +171,12 @@ tensorflow/third_party/llvm/expand_cmake_vars.py
 tensorflow/third_party/llvm/llvm.autogenerated.BUILD
 tensorflow/third_party/llvm/llvm.bzl
 tensorflow/third_party/icu/udata.patch
+tensorflow/third_party/nccl/archive.BUILD
+tensorflow/third_party/nccl/LICENSE
+tensorflow/third_party/nccl/system.BUILD.tpl
+tensorflow/third_party/nccl/nccl_configure.bzl
+tensorflow/third_party/nccl/build_defs.bzl.tpl
+tensorflow/third_party/nccl/BUILD
 tensorflow/third_party/fft2d/BUILD
 tensorflow/third_party/fft2d/fft.h
 tensorflow/third_party/fft2d/LICENSE
@@ -239,7 +241,7 @@ tensorflow/third_party/tflite_ovic_testdata.BUILD
 tensorflow/third_party/libxsmm.BUILD
 tensorflow/third_party/zlib.BUILD
 tensorflow/third_party/eigen.BUILD
-tensorflow/stream_executor/BUILD
+tensorflow/stream_executor/build_defs.bzl
 tensorflow/api_template_v1.__init__.py
 tensorflow/compat_template_v1.__init__.py
 tensorflow/api_template.__init__.py
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index b31396da4b2725e7cf5a18614af535457b81f6af..933ccf1c7bdc2c642aec58eefce7bdfc94fd23fa 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -79,6 +79,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = [
         "//tensorflow:__pkg__",
+        "//tensorflow:internal",
         "//tensorflow/python/estimator:__subpackages__",
         "//tensorflow/python/keras:__subpackages__",
         "//tensorflow/python/tools:__pkg__",
@@ -168,6 +169,7 @@ py_library(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/lite/python:lite",
         "//tensorflow/python/compat",
+        "//tensorflow/python/compat:v2_compat",
         "//tensorflow/python/data",
         "//tensorflow/python/distribute",
         "//tensorflow/python/distribute:estimator_training",
@@ -493,19 +495,11 @@ tf_cc_shared_object(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "file_system_test",
     size = "small",
     srcs = ["framework/file_system_test.py"],
-    data = [":framework/test_file_system.so"],
-    main = "framework/file_system_test.py",
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",  # Path issues due to test environment
-        "no_windows",
-        "notap",
-    ],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":data_flow_ops",
         ":framework",
@@ -514,57 +508,59 @@ py_test(
         ":platform",
         ":util",
     ],
+    data = [":framework/test_file_system.so"],
+    main = "framework/file_system_test.py",
+    tags = [
+        "no_pip",  # Path issues due to test environment
+        "no_windows",
+        "notap",
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "decorator_utils_test",
     srcs = ["util/decorator_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":platform",
         ":util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_export_test",
     srcs = ["util/tf_export_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":platform",
         ":util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "deprecation_test",
     srcs = ["util/deprecation_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":platform",
         ":util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "dispatch_test",
     srcs = ["util/dispatch_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":platform",
         ":util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "keyword_args_test",
     srcs = ["util/keyword_args_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
     ],
@@ -811,13 +807,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "function_def_to_graph_test",
     size = "small",
     srcs = ["framework/function_def_to_graph_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":constant_op",
@@ -829,6 +823,7 @@ py_test(
         ":math_ops",
         ":test_ops",
     ],
+    tags = ["no_pip"],
 )
 
 py_library(
@@ -939,12 +934,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "auto_control_deps_test",
     size = "small",
     srcs = ["framework/auto_control_deps_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":auto_control_deps",
         ":client_testlib",
     ],
@@ -979,12 +973,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "smart_cond_test",
     size = "small",
     srcs = ["framework/smart_cond_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":constant_op",
         ":framework_ops",
@@ -1075,6 +1068,7 @@ py_library(
     name = "extra_py_tests_deps",
     srcs_version = "PY2AND3",
     deps = [
+        ":keras_lib",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -1089,7 +1083,6 @@ py_library(
         ":client",
         ":errors",
         ":framework_for_generated_wrappers",
-        ":layers_base",
         ":platform",
         ":platform_test",
         ":pywrap_tensorflow",
@@ -1104,7 +1097,6 @@ py_library(
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:tape",
-        "//tensorflow/python/keras:layers",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -1132,52 +1124,47 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "framework_registry_test",
     size = "small",
     srcs = ["framework/registry_test.py"],
-    main = "framework/registry_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
         "//tensorflow/python:client_testlib",
     ],
+    main = "framework/registry_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_errors_test",
     size = "small",
     srcs = ["framework/errors_test.py"],
-    main = "framework/errors_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":errors",
         "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/errors_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_error_interpolation_test",
     size = "small",
     srcs = ["framework/error_interpolation_test.py"],
-    main = "framework/error_interpolation_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":constant_op",
         ":error_interpolation",
         ":traceable_stack",
     ],
+    main = "framework/error_interpolation_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_subscribe_test",
     size = "small",
     srcs = ["framework/subscribe_test.py"],
-    main = "framework/subscribe_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
@@ -1186,50 +1173,48 @@ py_test(
         ":script_ops",
         ":subscribe",
     ],
+    main = "framework/subscribe_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "contrib_test",
     size = "small",
     srcs = ["framework/contrib_test.py"],
-    main = "framework/contrib_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
     ],
+    main = "framework/contrib_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "build_info_test",
     size = "small",
     srcs = [
         "platform/build_info.py",
         "platform/build_info_test.py",
     ],
+    additional_deps = [
+        ":client_testlib",
+        ":platform",
+    ],
     main = "platform/build_info_test.py",
-    srcs_version = "PY2AND3",
     tags = [
         "no_pip",
         "notap",
     ],
-    deps = [
-        ":client_testlib",
-        ":platform",
-    ],
 )
 
-py_test(
+tf_py_test(
     name = "proto_test",
     size = "small",
     srcs = ["framework/proto_test.py"],
-    main = "framework/proto_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
         "//third_party/py/numpy",
     ],
+    main = "framework/proto_test.py",
 )
 
 tf_gen_op_wrapper_private_py(
@@ -1292,25 +1277,22 @@ cuda_py_tests(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "framework_versions_test",
     size = "small",
     srcs = ["framework/versions_test.py"],
-    main = "framework/versions_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
     ],
+    main = "framework/versions_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_importer_test",
     size = "large",
     srcs = ["framework/importer_test.py"],
-    main = "framework/importer_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework",
@@ -1322,9 +1304,10 @@ py_test(
         ":random_ops",
         ":test_ops",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/importer_test.py",
 )
 
 filegroup(
@@ -1335,18 +1318,11 @@ filegroup(
     visibility = ["//visibility:public"],
 )
 
-py_test(
+tf_py_test(
     name = "framework_meta_graph_test",
     size = "small",
     srcs = ["framework/meta_graph_test.py"],
-    data = ["//tensorflow/python:meta_graph_testdata"],
-    main = "framework/meta_graph_test.py",
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "no_windows",
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":control_flow_ops",
@@ -1361,21 +1337,26 @@ py_test(
         ":training",
         ":variables",
     ],
+    data = ["//tensorflow/python:meta_graph_testdata"],
+    main = "framework/meta_graph_test.py",
+    tags = [
+        "no_pip",
+        "no_windows",
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "framework_traceable_stack_test",
     size = "small",
     srcs = ["framework/traceable_stack_test.py"],
-    main = "framework/traceable_stack_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_test_lib",
         ":platform_test",
         ":test_ops",
         ":traceable_stack",
         ":util",
     ],
+    main = "framework/traceable_stack_test.py",
 )
 
 tf_gen_op_wrapper_py(
@@ -1410,29 +1391,25 @@ cc_library(
     alwayslink = 1,
 )
 
-py_test(
+tf_py_test(
     name = "framework_common_shapes_test",
     size = "small",
     srcs = ["framework/common_shapes_test.py"],
-    main = "framework/common_shapes_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
         "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/common_shapes_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_ops_test",
     size = "small",
     srcs = ["framework/ops_test.py"],
-    main = "framework/ops_test.py",
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],  # test_ops_2 is not available in pip.
-    deps = [
+    additional_deps = [
         ":cond_v2",
         ":control_flow_ops",
         ":errors",
@@ -1453,114 +1430,106 @@ py_test(
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:function",
     ],
+    main = "framework/ops_test.py",
+    tags = ["no_pip"],  # test_ops_2 is not available in pip.
 )
 
-py_test(
+tf_py_test(
     name = "framework_ops_enable_eager_test",
     size = "small",
     srcs = ["framework/ops_enable_eager_test.py"],
-    main = "framework/ops_enable_eager_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework",
         ":platform_test",
         "//tensorflow/python/eager:context",
     ],
+    main = "framework/ops_enable_eager_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_tensor_shape_test",
     size = "small",
     srcs = ["framework/tensor_shape_test.py"],
-    main = "framework/tensor_shape_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
         "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/tensor_shape_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_tensor_spec_test",
     size = "small",
     srcs = ["framework/tensor_spec_test.py"],
-    main = "framework/tensor_spec_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
         ":tensor_spec",
         "//third_party/py/numpy",
     ],
+    main = "framework/tensor_spec_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_sparse_tensor_test",
     size = "small",
     srcs = ["framework/sparse_tensor_test.py"],
-    main = "framework/sparse_tensor_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
         "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/sparse_tensor_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_device_test",
     size = "small",
     srcs = ["framework/device_test.py"],
-    main = "framework/device_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
         "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/device_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_random_seed_test",
     size = "small",
     srcs = ["framework/random_seed_test.py"],
-    main = "framework/random_seed_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework",
     ],
+    main = "framework/random_seed_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_tensor_shape_div_test",
     size = "small",
     srcs = ["framework/tensor_shape_div_test.py"],
-    main = "framework/tensor_shape_div_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
-        "//tensorflow/core:protos_all_py",
         "@six_archive//:six",
+        "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/tensor_shape_div_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_tensor_util_test",
     size = "small",
     srcs = ["framework/tensor_util_test.py"],
-    main = "framework/tensor_util_test.py",
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework",
@@ -1570,16 +1539,15 @@ py_test(
         ":state_ops_gen",
         "//third_party/py/numpy",
     ],
+    main = "framework/tensor_util_test.py",
+    tags = ["no_windows"],
 )
 
-py_test(
+tf_py_test(
     name = "framework_test_util_test",
     size = "small",
     srcs = ["framework/test_util_test.py"],
-    main = "framework/test_util_test.py",
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":control_flow_ops",
         ":errors",
         ":framework_for_generated_wrappers",
@@ -1590,35 +1558,35 @@ py_test(
         ":session",
         ":test_ops",
         ":variables",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
+    main = "framework/test_util_test.py",
+    tags = ["no_windows"],
 )
 
-py_test(
+tf_py_test(
     name = "framework_dtypes_test",
     size = "small",
     srcs = ["framework/dtypes_test.py"],
-    main = "framework/dtypes_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
+        "//third_party/py/numpy",
         "//tensorflow:tensorflow_py",
         "//tensorflow/core:protos_all_py",
-        "//third_party/py/numpy",
     ],
+    main = "framework/dtypes_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "op_def_library_test",
     size = "small",
     srcs = ["framework/op_def_library_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
@@ -1626,18 +1594,17 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "framework_kernels_test",
     size = "small",
     srcs = ["framework/kernels_test.py"],
-    main = "framework/kernels_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_test_lib",
         ":kernels",
         ":platform_test",
         ":test_ops",
     ],
+    main = "framework/kernels_test.py",
 )
 
 tf_gen_op_wrapper_private_py(
@@ -2080,12 +2047,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "clip_ops_test",
     size = "small",
     srcs = ["ops/clip_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":clip_ops",
         ":framework_for_generated_wrappers",
@@ -2103,12 +2069,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "collective_ops_test",
     size = "small",
     srcs = ["ops/collective_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":collective_ops",
         ":framework_for_generated_wrappers",
@@ -2903,11 +2868,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "sparse_ops_test",
     srcs = ["ops/sparse_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":constant_op",
         ":dtypes",
         ":framework_test_lib",
@@ -2930,11 +2894,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "sort_ops_test",
     srcs = ["ops/sort_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework",
@@ -3348,7 +3311,6 @@ cuda_py_test(
         ":framework_test_lib",
         ":functional_ops",
         ":gradients",
-        ":layers",
         ":list_ops",
         ":math_grad",
         ":math_ops",
@@ -3734,24 +3696,17 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "evaluation_test",
     size = "small",
     srcs = ["training/evaluation_test.py"],
-    shard_count = 3,
-    srcs_version = "PY2AND3",
-    tags = [
-        "manual",
-        "notap",  # Disabling until b/33000128 and b/33040312 are fixed.
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":framework",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
-        ":layers",
         ":math_ops",
         ":metrics",
         ":platform",
@@ -3759,9 +3714,14 @@ py_test(
         ":summary",
         ":training",
         ":variables",
+        "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/ops/losses",
-        "//third_party/py/numpy",
+    ],
+    shard_count = 3,
+    tags = [
+        "manual",
+        "notap",  # Disabling until b/33000128 and b/33040312 are fixed.
     ],
 )
 
@@ -3809,76 +3769,68 @@ py_library(
 )
 
 # Placeholder for intenal nest_test comments.
-py_test(
+tf_py_test(
     name = "util_nest_test",
     size = "small",
     srcs = ["util/nest_test.py"],
-    main = "util/nest_test.py",
-    srcs_version = "PY2AND3",
-    visibility = visibility + [
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":util",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
     ],
+    main = "util/nest_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "util_serialization_test",
     size = "small",
     srcs = ["util/serialization_test.py"],
-    main = "util/serialization_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
     ],
+    main = "util/serialization_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "future_api_test",
     size = "small",
     srcs = ["util/future_api_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":util",
         "//tensorflow:tensorflow_py",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "function_utils_test",
     srcs = ["util/function_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_contextlib_test",
     size = "small",
     srcs = ["util/tf_contextlib_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_decorator_test",
     size = "small",
     srcs = ["util/tf_decorator_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
     ],
@@ -3896,23 +3848,21 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_should_use_test",
     size = "small",
     srcs = ["util/tf_should_use_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":tf_should_use",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_inspect_test",
     size = "small",
     srcs = ["util/tf_inspect_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
     ],
@@ -3930,17 +3880,16 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "lock_util_test",
     size = "small",
     srcs = ["util/lock_util_test.py"],
-    main = "util/lock_util_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
         "@absl_py//absl/testing:parameterized",
     ],
+    main = "util/lock_util_test.py",
 )
 
 tf_proto_library(
@@ -3969,28 +3918,25 @@ tf_proto_library(
     visibility = ["//tensorflow:internal"],
 )
 
-py_test(
+tf_py_test(
     name = "protobuf_compare_test",
     size = "small",
     srcs = ["util/protobuf/compare_test.py"],
-    main = "util/protobuf/compare_test.py",
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],  # compare_test_pb2 proto is not available in pip.
-    deps = [
+    additional_deps = [
         ":compare_test_proto_py",
         ":platform_test",
         ":util",
         "@six_archive//:six",
     ],
+    main = "util/protobuf/compare_test.py",
+    tags = ["no_pip"],  # compare_test_pb2 proto is not available in pip.
 )
 
-py_test(
+tf_py_test(
     name = "util_example_parser_configuration_test",
     size = "small",
     srcs = ["util/example_parser_configuration_test.py"],
-    main = "util/example_parser_configuration_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
@@ -3998,14 +3944,14 @@ py_test(
         ":parsing_ops",
         ":util_example_parser_configuration",
     ],
+    main = "util/example_parser_configuration_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "events_writer_test",
     size = "small",
     srcs = ["client/events_writer_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":errors",
         ":framework_test_lib",
         ":lib",
@@ -4620,24 +4566,22 @@ cuda_py_test(
     tags = ["no_windows_gpu"],
 )
 
-py_test(
+tf_py_test(
     name = "c_api_util_test",
     size = "small",
     srcs = ["framework/c_api_util_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":c_api_util",
         ":framework_test_lib",
         ":platform_test",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "graph_util_test",
     size = "small",
     srcs = ["framework/graph_util_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client",
         ":client_testlib",
         ":framework",
@@ -4650,37 +4594,34 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "bfloat16_test",
     size = "small",
     srcs = ["lib/core/bfloat16_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":lib",
         ":pywrap_tensorflow",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "file_io_test",
     size = "small",
     srcs = ["lib/io/file_io_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":errors",
         ":lib",
     ],
+    tags = ["no_windows"],
 )
 
-py_test(
+tf_py_test(
     name = "tf_record_test",
     size = "small",
     srcs = ["lib/io/tf_record_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":errors",
         ":lib",
@@ -4870,17 +4811,11 @@ cuda_py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "saver_large_variable_test",
     size = "medium",
     srcs = ["training/saver_large_variable_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "manual",
-        "noasan",  # http://b/30379628
-        "notsan",  # http://b/30379628
-    ],
-    deps = [
+    additional_deps = [
         ":client",
         ":client_testlib",
         ":errors",
@@ -4889,18 +4824,18 @@ py_test(
         ":variables",
         "//tensorflow/core:protos_all_py",
     ],
+    tags = [
+        "manual",
+        "noasan",  # http://b/30379628
+        "notsan",  # http://b/30379628
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "saver_large_partitioned_variable_test",
     size = "medium",
     srcs = ["training/saver_large_partitioned_variable_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "noasan",  # http://b/30782289
-        "notsan",  # http://b/30782289
-    ],
-    deps = [
+    additional_deps = [
         ":client",
         ":client_testlib",
         ":framework_for_generated_wrappers",
@@ -4908,6 +4843,10 @@ py_test(
         ":training",
         ":variables",
     ],
+    tags = [
+        "noasan",  # http://b/30782289
+        "notsan",  # http://b/30782289
+    ],
 )
 
 cuda_py_test(
@@ -4953,16 +4892,11 @@ tf_py_test(
     tags = ["no_windows"],
 )
 
-py_test(
+tf_py_test(
     name = "basic_session_run_hooks_test",
     size = "medium",
     srcs = ["training/basic_session_run_hooks_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_windows",
-        "notsan",  # intermittent races on a few percent of runs
-    ],
-    deps = [
+    additional_deps = [
         ":client",
         ":client_testlib",
         ":control_flow_ops",
@@ -4979,21 +4913,17 @@ py_test(
         "//tensorflow/contrib/testing:testing_py",
         "//tensorflow/core:protos_all_py",
     ],
+    tags = [
+        "no_windows",
+        "notsan",  # intermittent races on a few percent of runs
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "checkpoint_utils_test",
     size = "small",
     srcs = ["training/checkpoint_utils_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "manual",
-        "no_cuda_on_cpu_tap",
-        "no_oss",
-        "no_windows",
-        "notap",
-    ],
-    deps = [
+    additional_deps = [
         ":client",
         ":client_testlib",
         ":framework_for_generated_wrappers",
@@ -5007,14 +4937,20 @@ py_test(
         ":variable_scope",
         ":variables",
     ],
+    tags = [
+        "manual",
+        "no_cuda_on_cpu_tap",
+        "no_oss",
+        "no_windows",
+        "notap",
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "checkpoint_ops_test",
     size = "small",
     srcs = ["training/checkpoint_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":checkpoint_ops_gen",
         ":client",
         ":client_testlib",
@@ -5030,12 +4966,11 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "warm_starting_util_test",
     size = "medium",
     srcs = ["training/warm_starting_util_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":dtypes",
@@ -5044,21 +4979,15 @@ py_test(
         ":training",
         ":variable_scope",
         ":variables",
-        "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "monitored_session_test",
     size = "medium",
     srcs = ["training/monitored_session_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "notsan",  # b/67945581
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":checkpoint_management",
         ":client_testlib",
@@ -5077,6 +5006,10 @@ py_test(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/distribute:distribute_coordinator",
     ],
+    tags = [
+        "no_pip",
+        "notsan",  # b/67945581
+    ],
 )
 
 py_library(
@@ -5098,12 +5031,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "training_util_test",
     size = "small",
     srcs = ["training/training_util_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework",
         ":platform",
@@ -5270,13 +5202,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "layers_base_test",
     size = "small",
     srcs = ["layers/base_test.py"],
-    main = "layers/base_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework_for_generated_wrappers",
@@ -5289,15 +5219,14 @@ py_test(
         ":variable_scope",
         "//tensorflow/python/eager:context",
     ],
+    main = "layers/base_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "layers_core_test",
     size = "small",
     srcs = ["layers/core_test.py"],
-    main = "layers/core_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework_for_generated_wrappers",
@@ -5310,15 +5239,14 @@ py_test(
         ":variables",
         "//third_party/py/numpy",
     ],
+    main = "layers/core_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "layers_convolutional_test",
     size = "small",
     srcs = ["layers/convolutional_test.py"],
-    main = "layers/convolutional_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
@@ -5327,32 +5255,31 @@ py_test(
         ":nn_ops",
         ":random_ops",
     ],
+    main = "layers/convolutional_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "layers_utils_test",
     size = "small",
     srcs = ["layers/utils_test.py"],
-    main = "layers/utils_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":layers",
     ],
+    main = "layers/utils_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "layers_pooling_test",
     size = "small",
     srcs = ["layers/pooling_test.py"],
-    main = "layers/pooling_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_test_lib",
         ":layers",
         ":random_ops",
     ],
+    main = "layers/pooling_test.py",
 )
 
 cuda_py_test(
@@ -5377,46 +5304,43 @@ cuda_py_test(
 # -----------------------------------------------------------------------------
 # Quantization
 
-py_test(
+tf_py_test(
     name = "dequantize_op_test",
     size = "small",
     srcs = ["ops/dequantize_op_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework_for_generated_wrappers",
         "//third_party/py/numpy",
     ],
+    tags = ["no_windows"],
 )
 
-py_test(
+tf_py_test(
     name = "quantized_ops_test",
     size = "small",
     srcs = ["ops/quantized_ops_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework_for_generated_wrappers",
         "//third_party/py/numpy",
     ],
+    tags = ["no_windows"],
 )
 
-py_test(
+tf_py_test(
     name = "quantized_conv_ops_test",
     size = "small",
     srcs = ["ops/quantized_conv_ops_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":nn_ops",
         "//third_party/py/numpy",
     ],
+    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -5631,38 +5555,32 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "item_test",
     size = "small",
     srcs = [
         "grappler/item_test.py",
     ],
-    srcs_version = "PY2AND3",
-    tags = [
-        "grappler",
-        "no_pip",  # tf_optimizer is not available in pip.
-    ],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":tf_item",
         "//tensorflow/core:protos_all_py",
     ],
+    tags = [
+        "grappler",
+        "no_pip",  # tf_optimizer is not available in pip.
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "datasets_test",
     size = "small",
     srcs = [
         "grappler/datasets_test.py",
     ],
-    srcs_version = "PY2AND3",
-    tags = [
-        "grappler",
-        "no_pip",  # tf_optimizer is not available in pip.
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework_for_generated_wrappers",
@@ -5670,6 +5588,10 @@ py_test(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data",
     ],
+    tags = [
+        "grappler",
+        "no_pip",  # tf_optimizer is not available in pip.
+    ],
 )
 
 py_library(
@@ -5718,25 +5640,24 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_optimizer_test",
     size = "small",
     srcs = [
         "grappler/tf_optimizer_test.py",
     ],
-    srcs_version = "PY2AND3",
-    tags = [
-        "grappler",
-        "no_pip",  # tf_optimizer is not available in pip.
-    ],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":tf_item",
         ":tf_optimizer",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    tags = [
+        "grappler",
+        "no_pip",  # tf_optimizer is not available in pip.
     ],
 )
 
@@ -5753,32 +5674,28 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "graph_placer_test",
     size = "large",
     srcs = ["grappler/graph_placer_test.py"],
-    tags = [
-        "grappler",
-        "no_pip",  # graph_placer is not available in pip.
-    ],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":graph_placer",
         "//tensorflow/python:math_ops",
     ],
+    tags = [
+        "grappler",
+        "no_pip",  # graph_placer is not available in pip.
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "memory_optimizer_test",
     size = "medium",
     srcs = [
         "grappler/memory_optimizer_test.py",
     ],
-    srcs_version = "PY2AND3",
-    tags = [
-        "grappler",
-    ],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":math_ops",
@@ -5789,8 +5706,11 @@ py_test(
         ":training",
         ":variable_scope",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    tags = [
+        "grappler",
     ],
 )
 
@@ -5830,7 +5750,6 @@ cuda_py_test(
         ":constant_op",
         ":dtypes",
         ":functional_ops",
-        ":layers",
         ":math_ops",
         ":nn",
         ":ops",
@@ -5875,17 +5794,11 @@ py_binary(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "cost_analyzer_test",
     size = "small",
     srcs = ["grappler/cost_analyzer_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "grappler",
-        "no_cuda_on_cpu_tap",
-        "no_pip",
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":cost_analyzer",
@@ -5897,8 +5810,13 @@ py_test(
         ":state_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    tags = [
+        "grappler",
+        "no_cuda_on_cpu_tap",
+        "no_pip",
     ],
 )
 
@@ -5911,24 +5829,23 @@ py_library(
     deps = [":pywrap_tensorflow_internal"],
 )
 
-py_test(
+tf_py_test(
     name = "model_analyzer_test",
     size = "small",
     srcs = ["grappler/model_analyzer_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "grappler",
-        "no_pip",
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":model_analyzer",
         ":state_ops",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    tags = [
+        "grappler",
+        "no_pip",
     ],
 )
 
@@ -5997,14 +5914,13 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "mode_keys_test",
     size = "small",
     srcs = [
         "training/mode_keys_test.py",
     ],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":mode_keys",
     ],
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 9f1e52b42bb73261e13ca37e29543242f682640e..31f9dce8e88b06bea790b41cf04faaf78df95419 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -121,6 +121,8 @@ from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import sysconfig
 from tensorflow.python.platform import test
 
+from tensorflow.python.compat import v2_compat
+
 from tensorflow.python.util.all_util import make_all
 from tensorflow.python.util.tf_export import tf_export
 
@@ -148,7 +150,7 @@ nn.rnn_cell = rnn_cell
 # pylint: disable=undefined-variable
 tf_export(v1=['AttrValue'])(AttrValue)
 tf_export(v1=['ConfigProto'])(ConfigProto)
-tf_export('Event', 'summary.Event')(Event)
+tf_export(v1=['Event', 'summary.Event'])(Event)
 tf_export(v1=['GPUOptions'])(GPUOptions)
 tf_export(v1=['GraphDef'])(GraphDef)
 tf_export(v1=['GraphOptions'])(GraphOptions)
@@ -161,10 +163,10 @@ tf_export(v1=['OptimizerOptions'])(OptimizerOptions)
 tf_export(v1=['RunMetadata'])(RunMetadata)
 tf_export(v1=['RunOptions'])(RunOptions)
 tf_export(v1=['SessionLog', 'summary.SessionLog'])(SessionLog)
-tf_export('Summary', 'summary.Summary')(Summary)
-tf_export('summary.SummaryDescription')(SummaryDescription)
-tf_export('SummaryMetadata')(SummaryMetadata)
-tf_export('summary.TaggedRunMetadata')(TaggedRunMetadata)
+tf_export(v1=['Summary', 'summary.Summary'])(Summary)
+tf_export(v1=['summary.SummaryDescription'])(SummaryDescription)
+tf_export(v1=['SummaryMetadata'])(SummaryMetadata)
+tf_export(v1=['summary.TaggedRunMetadata'])(TaggedRunMetadata)
 tf_export(v1=['TensorInfo'])(TensorInfo)
 # pylint: enable=undefined-variable
 
diff --git a/tensorflow/python/autograph/converters/logical_expressions.py b/tensorflow/python/autograph/converters/logical_expressions.py
index dfcaafdc9eba61bcb3c03432eadf309484d48dee..ea9740a22e1c065f04401fa3f15e8086349eb513 100644
--- a/tensorflow/python/autograph/converters/logical_expressions.py
+++ b/tensorflow/python/autograph/converters/logical_expressions.py
@@ -38,29 +38,29 @@ from tensorflow.python.autograph.pyct import templates
 SAFE_BOOLEAN_OPERAND = 'SAFE_BOOLEAN_OPERAND'
 
 
+OP_MAPPING = {
+    gast.And: 'ag__.and_',
+    gast.Eq: 'ag__.eq',
+    gast.NotEq: 'ag__.not_eq',
+    gast.Lt: 'ag__.lt',
+    gast.LtE: 'ag__.lt_e',
+    gast.Gt: 'ag__.gt',
+    gast.GtE: 'ag__.gt_e',
+    gast.Is: 'ag__.is_',
+    gast.IsNot: 'ag__.is_not',
+    gast.In: 'ag__.in_',
+    gast.Not: 'ag__.not_',
+    gast.NotIn: 'ag__.not_in',
+    gast.Or: 'ag__.or_',
+    gast.UAdd: 'ag__.u_add',
+    gast.USub: 'ag__.u_sub',
+    gast.Invert: 'ag__.invert',
+}
+
+
 class LogicalExpressionTransformer(converter.Base):
   """Converts logical expressions to corresponding TF calls."""
 
-  def __init__(self, ctx):
-    super(LogicalExpressionTransformer, self).__init__(ctx)
-    # TODO(mdan): For completeness and consistency, overload everything.
-    self.op_mapping = {
-        gast.And: 'ag__.and_',
-        gast.Eq: 'ag__.eq',
-        gast.NotEq: 'ag__.not_eq',
-        gast.Lt: 'ag__.lt',
-        gast.LtE: 'ag__.lt_e',
-        gast.Gt: 'ag__.gt',
-        gast.GtE: 'ag__.gt_e',
-        gast.Is: 'ag__.is_',
-        gast.IsNot: 'ag__.is_not',
-        gast.In: 'ag__.in_',
-        gast.Not: 'ag__.not_',
-        gast.NotIn: 'ag__.not_in',
-        gast.Or: 'ag__.or_',
-        gast.USub: 'ag__.u_sub',
-    }
-
   def _expect_simple_symbol(self, operand):
     if isinstance(operand, gast.Name):
       return
@@ -74,11 +74,11 @@ class LogicalExpressionTransformer(converter.Base):
 
   def _has_matching_func(self, operator):
     op_type = type(operator)
-    return op_type in self.op_mapping
+    return op_type in OP_MAPPING
 
   def _matching_func(self, operator):
     op_type = type(operator)
-    return self.op_mapping[op_type]
+    return OP_MAPPING[op_type]
 
   def _as_function(self, func_name, args, args_as_lambda=False):
     if args_as_lambda:
diff --git a/tensorflow/python/autograph/converters/logical_expressions_test.py b/tensorflow/python/autograph/converters/logical_expressions_test.py
index 687412750e0b2d3e7db275f6c25e5923ffaaa831..67ccd1fb47955053e0896df07e20903d4406370b 100644
--- a/tensorflow/python/autograph/converters/logical_expressions_test.py
+++ b/tensorflow/python/autograph/converters/logical_expressions_test.py
@@ -77,6 +77,13 @@ class LogicalExpressionTest(converter_testing.TestCase):
     with self.converted(test_fn, logical_expressions, {}) as result:
       self.assertTrue(result.test_fn('a', ('a',)))
 
+  def test_unary_ops(self):
+    def test_fn(a):
+      return ~a, -a, +a
+
+    with self.converted(test_fn, logical_expressions, {}) as result:
+      self.assertEqual(result.test_fn(1), (-2, -1, 1))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/impl/BUILD b/tensorflow/python/autograph/impl/BUILD
index 201a88875413982b0f1a791f3408b403a3259eb8..66f7915696ec400675810b8b954e6812294f0760 100644
--- a/tensorflow/python/autograph/impl/BUILD
+++ b/tensorflow/python/autograph/impl/BUILD
@@ -1,6 +1,6 @@
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
 filegroup(
     name = "all_files",
@@ -37,25 +37,23 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "api_test",
     srcs = ["api_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":impl",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python/autograph/utils",
-        "//third_party/py/numpy",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "conversion_test",
     srcs = ["conversion_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":impl",
-        "//tensorflow/python:client_testlib",
         "@gast_archive//:gast",
+        "//tensorflow/python:client_testlib",
     ],
 )
diff --git a/tensorflow/python/autograph/operators/BUILD b/tensorflow/python/autograph/operators/BUILD
index aedb901845b97bbee5918902875b5023a8604dcd..21a66c86b79e2116319bb240b138c6757484c6e0 100644
--- a/tensorflow/python/autograph/operators/BUILD
+++ b/tensorflow/python/autograph/operators/BUILD
@@ -38,6 +38,7 @@ py_library(
         "//tensorflow/python:list_ops",
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
         "//tensorflow/python:variables",
         "//tensorflow/python/autograph/utils",
         "//tensorflow/python/data/ops:dataset_ops",
diff --git a/tensorflow/python/autograph/operators/__init__.py b/tensorflow/python/autograph/operators/__init__.py
index 7a580fe32475cbc32f20a1196c075fbf7f981d27..35f8028c295550443b98ca430d459967e03a6edf 100644
--- a/tensorflow/python/autograph/operators/__init__.py
+++ b/tensorflow/python/autograph/operators/__init__.py
@@ -52,6 +52,7 @@ from tensorflow.python.autograph.operators.logical import eq
 from tensorflow.python.autograph.operators.logical import gt
 from tensorflow.python.autograph.operators.logical import gt_e
 from tensorflow.python.autograph.operators.logical import in_
+from tensorflow.python.autograph.operators.logical import invert
 from tensorflow.python.autograph.operators.logical import is_
 from tensorflow.python.autograph.operators.logical import is_not
 from tensorflow.python.autograph.operators.logical import lt
@@ -60,6 +61,7 @@ from tensorflow.python.autograph.operators.logical import not_
 from tensorflow.python.autograph.operators.logical import not_eq
 from tensorflow.python.autograph.operators.logical import not_in
 from tensorflow.python.autograph.operators.logical import or_
+from tensorflow.python.autograph.operators.logical import u_add
 from tensorflow.python.autograph.operators.logical import u_sub
 from tensorflow.python.autograph.operators.py_builtins import float_
 from tensorflow.python.autograph.operators.py_builtins import int_
diff --git a/tensorflow/python/autograph/operators/control_flow.py b/tensorflow/python/autograph/operators/control_flow.py
index afa3787d4277985285d5dc8b3e1531a00460076b..035ea1bd9277a8dc66d9766cd00f5b8ccd6ad272 100644
--- a/tensorflow/python/autograph/operators/control_flow.py
+++ b/tensorflow/python/autograph/operators/control_flow.py
@@ -23,6 +23,7 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.util import nest
 
 
 def for_stmt(iter_, extra_test, body, init_state):
@@ -160,7 +161,8 @@ def while_stmt(test, body, init_state, extra_deps, opts=None):
   # TODO(mdan): Consider adding a generic mechanism for dynamic dispatch.
   # That could be something as simple as a collection of dispatch rules, with
   # some prioritization.
-  if any(tensor_util.is_tensor(v) for v in init_state + extra_deps):
+  if any(tensor_util.is_tensor(v)
+         for v in nest.flatten(init_state + extra_deps)):
     return _tf_while_stmt(test, body, init_state, opts)
   else:
     return _py_while_stmt(test, body, init_state, opts)
diff --git a/tensorflow/python/autograph/operators/control_flow_test.py b/tensorflow/python/autograph/operators/control_flow_test.py
index 0a7d4b64022f583bae4effc7d0f7eb04f46cc048..f9e006f7ad330aed3a130f2f1198f236aef15eea 100644
--- a/tensorflow/python/autograph/operators/control_flow_test.py
+++ b/tensorflow/python/autograph/operators/control_flow_test.py
@@ -36,7 +36,7 @@ class ForLoopTest(test.TestCase):
         extra_test=lambda s: True,
         body=lambda i, s: (s + i,),
         init_state=(0,))
-    with self.cached_session() as sess:
+    with self.cached_session():
       self.assertEqual((10,), self.evaluate(s))
 
   def test_python(self):
@@ -55,7 +55,7 @@ class ForLoopTest(test.TestCase):
         extra_test=lambda s: True,
         body=lambda i, s: (s + i,),
         init_state=(0,))
-    with self.cached_session() as sess:
+    with self.cached_session():
       self.assertEqual((10,), self.evaluate(s))
 
 
@@ -65,18 +65,30 @@ class WhileLoopTest(test.TestCase):
   def test_tensor(self):
     n = constant_op.constant(5)
     results = control_flow.while_stmt(
-        test=lambda i, s: i < n,
-        body=lambda i, s: (i + 1, s + i,),
+        test=lambda i, sum: i < n,
+        body=lambda i, sum: (i + 1, sum + i,),
         init_state=(0, 0),
         extra_deps=(n,))
-    with self.cached_session() as sess:
+    with self.cached_session():
       self.assertEqual((5, 10), self.evaluate(results))
 
+  @test_util.run_deprecated_v1
+  def test_tensor_dict_state(self):
+    n = 5
+    init_state = {'i': constant_op.constant(0), 'sum': constant_op.constant(0)}
+    results = control_flow.while_stmt(
+        test=lambda s: s['i'] < n,
+        body=lambda s: ({'i': s['i'] + 1, 'sum': s['sum'] + s['i']},),
+        init_state=(init_state,),
+        extra_deps=())
+    with self.cached_session():
+      self.assertEqual(({'i': 5, 'sum': 10},), self.evaluate(results))
+
   def test_python(self):
     n = 5
     results = control_flow.while_stmt(
-        test=lambda i, s: i < n,
-        body=lambda i, s: (i + 1, s + i),
+        test=lambda i, sum: i < n,
+        body=lambda i, sum: (i + 1, sum + i),
         init_state=(0, 0),
         extra_deps=(n,))
     self.assertEqual((5, 10), results)
@@ -93,7 +105,7 @@ class IfStmtTest(test.TestCase):
 
   @test_util.run_deprecated_v1
   def test_tensor(self):
-    with self.cached_session() as sess:
+    with self.cached_session():
       t = self.single_return_if_stmt(constant_op.constant(True))
       self.assertEqual(1, self.evaluate(t))
       t = self.single_return_if_stmt(constant_op.constant(False))
@@ -105,7 +117,7 @@ class IfStmtTest(test.TestCase):
 
   @test_util.run_deprecated_v1
   def test_tensor_multiple_returns(self):
-    with self.cached_session() as sess:
+    with self.cached_session():
       t = self.multi_return_if_stmt(constant_op.constant(True))
       self.assertAllEqual([1, 2], self.evaluate(t))
       t = self.multi_return_if_stmt(constant_op.constant(False))
diff --git a/tensorflow/python/autograph/operators/logical.py b/tensorflow/python/autograph/operators/logical.py
index 569db5b91bd7efb92ce2b8a8b8eb6eb773f4abcb..dadb0daf1ae22016d0cff2889472423149258ffb 100644
--- a/tensorflow/python/autograph/operators/logical.py
+++ b/tensorflow/python/autograph/operators/logical.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import operator
+
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_math_ops
@@ -35,7 +37,7 @@ def and_(a, b):
   a_val = a()
   if tensor_util.is_tensor(a_val):
     return _tf_lazy_and(a_val, b)
-  return _py_lazy_and(a_val, b)
+  return a_val and b()
 
 
 def _tf_lazy_and(cond, b):
@@ -44,17 +46,12 @@ def _tf_lazy_and(cond, b):
   return control_flow_ops.cond(cond, b, lambda: cond)
 
 
-def _py_lazy_and(cond, b):
-  """Lazy-eval equivalent of "and" in Python."""
-  return cond and b()
-
-
 def or_(a, b):
   """Functional form of "or". Uses lazy evaluation semantics."""
   a_val = a()
   if tensor_util.is_tensor(a_val):
     return _tf_lazy_or(a_val, b)
-  return _py_lazy_or(a_val, b)
+  return a_val or b()
 
 
 def _tf_lazy_or(cond, b):
@@ -63,16 +60,11 @@ def _tf_lazy_or(cond, b):
   return control_flow_ops.cond(cond, lambda: cond, b)
 
 
-def _py_lazy_or(cond, b):
-  """Lazy-eval equivalent of "or" in Python."""
-  return cond or b()
-
-
 def eq(a, b):
   """Functional form of "equal"."""
   if tensor_util.is_tensor(a) or tensor_util.is_tensor(b):
     return _tf_equal(a, b)
-  return _py_equal(a, b)
+  return a == b
 
 
 def _tf_equal(a, b):
@@ -80,11 +72,6 @@ def _tf_equal(a, b):
   return gen_math_ops.equal(a, b)
 
 
-def _py_equal(a, b):
-  """Overload of "equal" that falls back to Python's default implementation."""
-  return a == b
-
-
 def not_eq(a, b):
   """Functional form of "not-equal"."""
   return not_(eq(a, b))
@@ -92,25 +79,8 @@ def not_eq(a, b):
 
 # Default implementation for the remainings.
 
-
-def gt(a, b):
-  """Functional form of "less-than"."""
-  return a > b
-
-
-def gt_e(a, b):
-  """Functional form of "less-than"."""
-  return a >= b
-
-
-def is_(a, b):
-  """Functional form of "less-than"."""
-  return a is b
-
-
-def is_not(a, b):
-  """Functional form of "less-than"."""
-  return a is not b
+is_ = operator.is_
+is_not = operator.is_not
 
 
 def in_(a, b):
@@ -119,21 +89,16 @@ def in_(a, b):
   return a in b
 
 
-def lt(a, b):
-  """Functional form of "less-than"."""
-  return a < b
-
-
-def lt_e(a, b):
-  """Functional form of "less-than"."""
-  return a <= b
-
-
 def not_in(a, b):
   """Functional form of "less-than"."""
   return a not in b
 
+gt = operator.gt
+gt_e = operator.ge
+lt = operator.lt
+lt_e = operator.le
+
 
-def u_sub(a):
-  """Functional form of "unary-sub"."""
-  return -a
+u_add = operator.pos
+u_sub = operator.neg
+invert = operator.invert
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 87a200ed336735f4b4abd9b0ac2352e36f7b84e4..b97eb884b36ed2246d6bf59f215786114d719e0f 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -736,10 +736,11 @@ class BaseSession(SessionInterface):
     if self._session is not None:
       try:
         tf_session.TF_DeleteSession(self._session)
-      except AttributeError:
-        # At shutdown, `c_api_util` or `tf_session` may have been garbage
-        # collected, causing the above method calls to fail. In this case,
-        # silently leak since the program is about to terminate anyway.
+      except (AttributeError, TypeError):
+        # At shutdown, `c_api_util`, `tf_session`, or
+        # `tf_session.TF_DeleteSession` may have been garbage collected, causing
+        # the above method calls to fail. In this case, silently leak since the
+        # program is about to terminate anyway.
         pass
       self._session = None
 
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index c4a118a41406afc52586553b1d3f0b446005c46d..da6218663de8b02fcda3f3e67e68bb46e47e914a 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -2036,7 +2036,7 @@ class SessionTest(test_util.TensorFlowTestCase):
     with self.cached_session() as sess:
       a = array_ops.placeholder(dtype=dtypes.string)
       with self.assertRaisesRegexp(
-          TypeError, 'Type of feed value 1 with type <(\w+) \'int\'> is not'):
+          TypeError, r'Type of feed value 1 with type <(\w+) \'int\'> is not'):
         sess.run(a, feed_dict={a: 1})
 
 
diff --git a/tensorflow/python/compat/BUILD b/tensorflow/python/compat/BUILD
index 9f2ce8c676e77480106c525bdc9c6440c599acec..87dd5d7f669f2f1cfe8fb5068a96dbdab62897d4 100644
--- a/tensorflow/python/compat/BUILD
+++ b/tensorflow/python/compat/BUILD
@@ -4,13 +4,23 @@ exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
+py_library(
+    name = "v2_compat",
+    srcs = ["v2_compat.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/python:tf2",
+        "//tensorflow/python:util",
+    ],
+)
+
 py_library(
     name = "compat",
     srcs = ["compat.py"],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/python:tf2",
         "//tensorflow/python:util",
     ],
 )
@@ -24,3 +34,14 @@ tf_py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
+
+tf_py_test(
+    name = "disable_v2_behavior_test",
+    size = "small",
+    srcs = ["disable_v2_behavior_test.py"],
+    additional_deps = [
+        ":v2_compat",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 3477576d2ffa3e248e58723b4794ccf2c2be71c4..638bd445a62b7a8225f48b9fde43689e0a0f3246 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -24,15 +24,10 @@ from __future__ import print_function
 
 import datetime
 
-from tensorflow.python import tf2
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import variable_scope
-
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 12, 26)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 1, 10)
 
 
 @tf_export("compat.forward_compatible")
@@ -138,40 +133,3 @@ def forward_compatibility_horizon(year, month, day):
     yield
   finally:
     _FORWARD_COMPATIBILITY_HORIZON = old_compat_date
-
-
-@tf_export(v1=["enable_v2_behavior"])
-def enable_v2_behavior():
-  """Enables TensorFlow 2.x behaviors.
-
-  This function can be called at the beginning of the program (before `Tensors`,
-  `Graphs` or other structures have been created, and before devices have been
-  initialized. It switches all global behaviors that are different between
-  TensorFlow 1.x and 2.x to behave as intended for 2.x.
-
-  This function is called in the main TensorFlow `__init__.py` file, user should
-  not need to call it, except during complex migrations.
-  """
-  tf2.enable()  # Switches TensorArrayV2 and control flow V2
-  ops.enable_eager_execution()
-  tensor_shape.enable_v2_tensorshape()  # Also switched by tf2
-  variable_scope.enable_resource_variables()
-
-
-@tf_export(v1=["disable_v2_behavior"])
-def disable_v2_behavior():
-  """Disables TensorFlow 2.x behaviors.
-
-  This function can be called at the beginning of the program (before `Tensors`,
-  `Graphs` or other structures have been created, and before devices have been
-  initialized. It switches all global behaviors that are different between
-  TensorFlow 1.x and 2.x to behave as intended for 1.x.
-
-  User can call this function to disable 2.x behavior during complex migrations.
-  """
-  tf2.disable()  # Switches TensorArrayV2 and control flow V2
-  ops.disable_eager_execution()
-  tensor_shape.disable_v2_tensorshape()  # Also switched by tf2
-  variable_scope.disable_resource_variables()
-
-
diff --git a/tensorflow/python/compat/disable_v2_behavior_test.py b/tensorflow/python/compat/disable_v2_behavior_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c247eac395ec3b71c2d1840964cc351b9b78de6d
--- /dev/null
+++ b/tensorflow/python/compat/disable_v2_behavior_test.py
@@ -0,0 +1,39 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for forward and backwards compatibility utilties."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.compat import v2_compat
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+
+
+class DisableV2BehaviorTest(test.TestCase):
+
+  def test_basic(self):
+    t = constant_op.constant([1, 2, 3])  # creates a hidden context
+    self.assertTrue(isinstance(t, ops.EagerTensor))
+    v2_compat.disable_v2_behavior()
+    t = constant_op.constant([1, 2, 3])
+    self.assertFalse(isinstance(t, ops.EagerTensor))
+
+
+if __name__ == '__main__':
+  v2_compat.enable_v2_behavior()
+  test.main()
diff --git a/tensorflow/python/compat/v2_compat.py b/tensorflow/python/compat/v2_compat.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a94939ae11dbf28146ae12ab21d11990dbb2516
--- /dev/null
+++ b/tensorflow/python/compat/v2_compat.py
@@ -0,0 +1,64 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Switching v2 features on and off."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python import tf2
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras.layers import normalization
+from tensorflow.python.ops import variable_scope
+
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export(v1=["enable_v2_behavior"])
+def enable_v2_behavior():
+  """Enables TensorFlow 2.x behaviors.
+
+  This function can be called at the beginning of the program (before `Tensors`,
+  `Graphs` or other structures have been created, and before devices have been
+  initialized. It switches all global behaviors that are different between
+  TensorFlow 1.x and 2.x to behave as intended for 2.x.
+
+  This function is called in the main TensorFlow `__init__.py` file, user should
+  not need to call it, except during complex migrations.
+  """
+  tf2.enable()  # Switches TensorArrayV2 and control flow V2
+  ops.enable_eager_execution()
+  tensor_shape.enable_v2_tensorshape()  # Also switched by tf2
+  variable_scope.enable_resource_variables()
+  normalization.enable_v2_batch_normalization()
+
+
+@tf_export(v1=["disable_v2_behavior"])
+def disable_v2_behavior():
+  """Disables TensorFlow 2.x behaviors.
+
+  This function can be called at the beginning of the program (before `Tensors`,
+  `Graphs` or other structures have been created, and before devices have been
+  initialized. It switches all global behaviors that are different between
+  TensorFlow 1.x and 2.x to behave as intended for 1.x.
+
+  User can call this function to disable 2.x behavior during complex migrations.
+  """
+  tf2.disable()  # Switches TensorArrayV2 and control flow V2
+  ops.disable_eager_execution()
+  tensor_shape.disable_v2_tensorshape()  # Also switched by tf2
+  variable_scope.disable_resource_variables()
+  normalization.disable_v2_batch_normalization()
diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD
index 9362a3e8eb8c23643fc83bf821cbf6ea2ec8eaad..36478785c9155243e092bb498f332f031a2d0e46 100644
--- a/tensorflow/python/data/experimental/kernel_tests/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/BUILD
@@ -23,6 +23,7 @@ py_test(
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
index 3324243c54351e297ae15c36bb56fcb5342e5ce5..71ae14b5e106f4dac750f65f494682cb549e2286 100644
--- a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
@@ -19,6 +19,8 @@ from __future__ import print_function
 
 import random
 
+from absl.testing import parameterized
+
 from tensorflow.python.data.experimental.ops import grouping
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
@@ -72,10 +74,140 @@ def _get_record_shape(sparse):
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class BucketBySequenceLengthTest(test_base.DatasetTestBase):
+class BucketBySequenceLengthTest(test_base.DatasetTestBase,
+                                 parameterized.TestCase):
+
+  # TODO(b/117581999): add eager coverage.
+  @parameterized.named_parameters(
+      ("WithoutPadding", True),
+      ("WithPadding", False),
+  )
+  def testSkipEagerBucketDropReminder(self, param_no_padding):
+
+    boundaries = [10, 20, 30]
+    batch_sizes = [10, 8, 4, 2]
+    lengths = [8, 13, 25, 35]
+
+    n_bucket_elements = [28, 7, 6, 5]
+    n_expected_batches = 5
+
+    # Expected sequence lengths of the individual batches.
+    expected_lengths = []
+
+    # Expected sum of all batches with an equal sequence length.
+    # <seq-length>: <expected-total-sum>
+    expected_sums = dict()
+
+    # Expected batch sizes of batches depending on the sequence length.
+    # <seq-length>: [batch1_size, ..., batchN_size]
+    expected_batch_sizes = dict()
+
+    for length, batch_size, bucket_elements in zip(lengths, batch_sizes,
+                                                   n_bucket_elements):
+      # Calculate the expected sum across all batches of a specific sequence length.
+      expected_sums[length] = \
+          (bucket_elements - bucket_elements % batch_size) * length
+      # Calculate the expected occurrence of individual batch sizes.
+      expected_batch_sizes[length] = \
+          [batch_size] * (bucket_elements // batch_size)
+      # Calculate the expected occurence of individual sequence lengths.
+      expected_lengths.extend([length] * (bucket_elements // batch_size))
+
+    def build_dataset(sparse):
+
+      def _generator():
+        # Produce 1 batch for each bucket
+        elements = []
+        for bucket_elements, length in zip(n_bucket_elements, lengths):
+          # Using only full sequences (opposed to the strategy employed in `testBucket`) makes
+          # checking the sum a lot easier.
+          record_len = length
+          for _ in range(bucket_elements):
+            elements.append([1] * record_len)
+        random.shuffle(elements)
+        for el in elements:
+          yield (_format_record(el, sparse),)
+
+      dataset = dataset_ops.Dataset.from_generator(
+          _generator, (_get_record_type(sparse),), (_get_record_shape(sparse),))
+      if sparse:
+        dataset = dataset.map(lambda x: (_to_sparse_tensor(x),))
+      return dataset
+
+    def _test_bucket_by_padding(no_padding):
+      dataset = build_dataset(sparse=no_padding)
+      dataset = dataset.apply(
+          grouping.bucket_by_sequence_length(
+              _element_length_fn,
+              boundaries,
+              batch_sizes,
+              no_padding=no_padding,
+              drop_remainder=True))
+
+      get_next = self.getNext(dataset)
+      batches = []
+      for _ in range(n_expected_batches):
+        batch, = self.evaluate(get_next())
+        batches.append(batch)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
+
+      generated_lengths = []
+
+      # <seq-length>: <total-sum>
+      generated_sums = dict()
+
+      # <seq-length>: [<batch_size>, ...]
+      generated_batch_sizes = dict()
+
+      for length, batch_size, bucket_elements in zip(lengths, batch_sizes,
+                                                     n_bucket_elements):
+        # Initialize the sum across all batches.
+        generated_sums[length] = 0
+        # Initialize the individual batch sizes.
+        generated_batch_sizes[length] = []
+
+      for batch in batches:
+        shape = batch.dense_shape if no_padding else batch.shape
+        length = shape[1]
+        generated_lengths.append(length)
+
+        batch_size = shape[0]
+        generated_batch_sizes[length].append(batch_size)
+
+        batch_sum = batch.values.sum() if no_padding else batch.sum()
+        generated_sums[length] += batch_sum
+
+      for l in lengths:
+        # Make sure the sum of the batch contents is correct for the individual sequence lengths.
+        self.assertEqual(
+            generated_sums[l], expected_sums[l], "Tensor sums did not match! "
+            "expected: {}, generated: {}".format(expected_sums, generated_sums))
+
+        # Make sure the individual batch sizes are generated as expected.
+        self.assertEqual(
+            sorted(generated_batch_sizes[l]), sorted(expected_batch_sizes[l]),
+            "Batch-sizes did not match! "
+            "expected: {}, generated: {}".format(
+                sorted(expected_batch_sizes[l]),
+                sorted(generated_batch_sizes[l])))
+
+      # Make sure the generated sequence lengths appear as often as expected.
+      self.assertEqual(
+          sorted(generated_lengths), sorted(expected_lengths),
+          "The generated sequence lengths did not match! "
+          "expected: {}, generated: {}".format(
+              sorted(expected_lengths), sorted(generated_lengths)))
+
+    _test_bucket_by_padding(param_no_padding)
 
   # TODO(b/117581999): add eager coverage.
-  def testSkipEagerBucket(self):
+  @parameterized.named_parameters(
+      ("WithoutPadding", True),
+      ("WithPadding", False),
+  )
+  def testSkipEagerBucket(self, param_no_padding):
 
     boundaries = [10, 20, 30]
     batch_sizes = [10, 8, 4, 2]
@@ -132,8 +264,7 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
       self.assertEqual(sorted(batch_sizes), sorted(batch_sizes_val))
       self.assertEqual(sorted(lengths), sorted(lengths_val))
 
-    for no_padding in (True, False):
-      _test_bucket_by_padding(no_padding)
+    _test_bucket_by_padding(param_no_padding)
 
   def testPadToBoundary(self):
 
@@ -218,7 +349,11 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
     self.assertAllEqual(batches[4], [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
                                      [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
 
-  def testTupleElements(self):
+  @parameterized.named_parameters(
+      ("WithoutPadding", True),
+      ("WithPadding", False),
+  )
+  def testTupleElements(self, param_no_padding):
 
     def build_dataset(sparse):
       def _generator():
@@ -246,11 +381,14 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
       self.assertEqual([None, None], shapes[0].as_list())
       self.assertEqual([None], shapes[1].as_list())
 
-    for no_padding in (True, False):
-      _test_tuple_elements_by_padding(no_padding)
+    _test_tuple_elements_by_padding(param_no_padding)
 
   # TODO(b/117581999): add eager coverage
-  def testSkipEagerBucketSparse(self):
+  @parameterized.named_parameters(
+      ("DoDropRemainder", True),
+      ("DoNotDropRemainder", False),
+  )
+  def testSkipEagerBucketSparse(self, param_drop_remainder):
     """Tests bucketing of sparse tensors (case where `no_padding` == True).
 
     Test runs on following dataset:
@@ -281,11 +419,16 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
       dataset = dataset.map(_to_sparse_tensor)
       return dataset
 
-    def _compute_expected_batches():
+    def _compute_expected_batches(drop_remainder):
       """Computes expected batch outputs and stores in a set."""
       all_expected_sparse_tensors = set()
       for bucket_start_len in range(min_len, max_len, bucket_size):
-        for batch_offset in range(0, bucket_size, batch_size):
+        if drop_remainder:
+          batch_offsets = [0]
+        else:
+          batch_offsets = range(0, bucket_size, batch_size)
+
+        for batch_offset in batch_offsets:
           batch_start_len = bucket_start_len + batch_offset
           batch_end_len = min(batch_start_len + batch_size,
                               bucket_start_len + bucket_size)
@@ -314,13 +457,14 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
       return all_sparse_tensors
     dataset = _build_dataset()
     boundaries = range(min_len + bucket_size + 1, max_len, bucket_size)
-    dataset = dataset.apply(grouping.bucket_by_sequence_length(
-        _element_length_fn,
-        boundaries,
-        [batch_size] * (len(boundaries) + 1),
-        no_padding=True))
+    dataset = dataset.apply(
+        grouping.bucket_by_sequence_length(
+            _element_length_fn,
+            boundaries, [batch_size] * (len(boundaries) + 1),
+            no_padding=True,
+            drop_remainder=param_drop_remainder))
     batches = _compute_batches(dataset)
-    expected_batches = _compute_expected_batches()
+    expected_batches = _compute_expected_batches(param_drop_remainder)
     self.assertEqual(batches, expected_batches)
 
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
index b80aab994e1754faccde5653de9149f32a5f862c..f5a15f4c848c536ac07636469ea1f8b762bd317e 100644
--- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
@@ -104,9 +104,7 @@ class StatsDatasetTestBase(test_base.DatasetTestBase):
         self._assertSummaryHasCountMoreOrEqualGeneralisedTag(
             summary_str, "::execution_time", float(i + 1))
       self._assertSummaryContains(summary_str,
-                                  dataset_name + "::num_parallel_calls")
-      self._assertSummaryContains(summary_str,
-                                  dataset_name + "::active_parallel_calls")
+                                  dataset_name + "::thread_utilization")
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate(next_element())
     if function_processing_time:
diff --git a/tensorflow/python/data/experimental/ops/batching.py b/tensorflow/python/data/experimental/ops/batching.py
index f0cf7f0a9954044e20a1487fb357aa8b4c974263..ff8a9182c36a9684b543ccb80a11bfe1873fe768 100644
--- a/tensorflow/python/data/experimental/ops/batching.py
+++ b/tensorflow/python/data/experimental/ops/batching.py
@@ -553,8 +553,12 @@ class _MapAndBatchDataset(dataset_ops.UnaryDataset):
     """See `Dataset.map()` for details."""
     self._input_dataset = input_dataset
 
-    self._map_func = dataset_ops.StructuredFunctionWrapper(
-        map_func, "tf.data.experimental.map_and_batch()", dataset=input_dataset)
+    if isinstance(map_func, dataset_ops.StructuredFunctionWrapper):
+      self._map_func = map_func
+    else:
+      self._map_func = dataset_ops.StructuredFunctionWrapper(
+          map_func, "tf.data.experimental.map_and_batch()",
+          dataset=input_dataset)
     self._batch_size_t = ops.convert_to_tensor(
         batch_size, dtype=dtypes.int64, name="batch_size")
     self._num_parallel_calls_t = ops.convert_to_tensor(
diff --git a/tensorflow/python/data/experimental/ops/grouping.py b/tensorflow/python/data/experimental/ops/grouping.py
index 2435f0cfdb77ba607c90db66af499780288c324b..4e83acf6bbadc065adae1a6fe3da81bc6ff19d0e 100644
--- a/tensorflow/python/data/experimental/ops/grouping.py
+++ b/tensorflow/python/data/experimental/ops/grouping.py
@@ -130,7 +130,8 @@ def bucket_by_sequence_length(element_length_func,
                               padded_shapes=None,
                               padding_values=None,
                               pad_to_bucket_boundary=False,
-                              no_padding=False):
+                              no_padding=False,
+                              drop_remainder=False):
   """A transformation that buckets elements in a `Dataset` by length.
 
   Elements of the `Dataset` are grouped together by length and then are padded
@@ -160,6 +161,10 @@ def bucket_by_sequence_length(element_length_func,
       any elements with length longer than `max(bucket_boundaries)`.
     no_padding: `bool`, indicates whether to pad the batch features (features
       need to be either of type `tf.SparseTensor` or of same shape).
+    drop_remainder: (Optional.) A `tf.bool` scalar `tf.Tensor`, representing
+      whether the last batch should be dropped in the case it has fewer than
+      `batch_size` elements; the default behavior is not to drop the smaller
+      batch.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
@@ -209,7 +214,7 @@ def bucket_by_sequence_length(element_length_func,
       """Batch elements in dataset."""
       batch_size = window_size_fn(bucket_id)
       if no_padding:
-        return grouped_dataset.batch(batch_size)
+        return grouped_dataset.batch(batch_size, drop_remainder=drop_remainder)
       none_filler = None
       if pad_to_bucket_boundary:
         err_msg = ("When pad_to_bucket_boundary=True, elements must have "
@@ -227,7 +232,8 @@ def bucket_by_sequence_length(element_length_func,
       shapes = make_padded_shapes(
           padded_shapes or grouped_dataset.output_shapes,
           none_filler=none_filler)
-      return grouped_dataset.padded_batch(batch_size, shapes, padding_values)
+      return grouped_dataset.padded_batch(
+          batch_size, shapes, padding_values, drop_remainder=drop_remainder)
 
     def _apply_fn(dataset):
       return dataset.apply(
diff --git a/tensorflow/python/data/experimental/ops/optimization_options.py b/tensorflow/python/data/experimental/ops/optimization_options.py
index 41a819d94bb88384c89cbc9b3eb0d4dc59575e0e..eb95484ac538f7878609b5afcd4ad03ef0347d6b 100644
--- a/tensorflow/python/data/experimental/ops/optimization_options.py
+++ b/tensorflow/python/data/experimental/ops/optimization_options.py
@@ -33,7 +33,7 @@ class OptimizationOptions(options.OptionsBase):
   ```python
   options = tf.data.Options()
   options.experimental_optimization.map_vectorization = True
-  options.apply_default_optimizations = False
+  options.experimental_optimization.apply_default_optimizations = False
   dataset = dataset.with_options(options)
   ```
   """
diff --git a/tensorflow/python/data/experimental/ops/stats_aggregator.py b/tensorflow/python/data/experimental/ops/stats_aggregator.py
index d5fcc033ab7df34369e0680275df744c431ed069..3e4c66be27018d25d4877d26ac565b4500633d0d 100644
--- a/tensorflow/python/data/experimental/ops/stats_aggregator.py
+++ b/tensorflow/python/data/experimental/ops/stats_aggregator.py
@@ -45,7 +45,7 @@ class StatsAggregator(object):
 
   # Apply `StatsOptions` to associate `dataset` with `aggregator`.
   options = dataset_ops.Options()
-  options.experimental_stats = tf.data.experimental.StatsOptions(aggregator)
+  options.experimental_stats.aggregator = aggregator
   dataset = dataset.with_options(options)
   ```
 
diff --git a/tensorflow/python/data/kernel_tests/interleave_test.py b/tensorflow/python/data/kernel_tests/interleave_test.py
index 4fb61b2daf125ce08a3ba99d81c7721a7fb6dc0a..4b427ff5a4173d73171400a2d3f36cbdfd416cdd 100644
--- a/tensorflow/python/data/kernel_tests/interleave_test.py
+++ b/tensorflow/python/data/kernel_tests/interleave_test.py
@@ -17,19 +17,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import threading
-
 from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.python.data.experimental.ops import threading_options
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
@@ -78,47 +74,6 @@ def _interleave(lists, cycle_length, block_length):
           break
 
 
-def _make_coordinated_sloppy_dataset(input_values, cycle_length, block_length,
-                                     num_parallel_calls):
-  """Produces a dataset iterator and events to control the order of elements.
-
-  Args:
-    input_values: the values to generate lists to interleave from
-    cycle_length: the length of the interleave cycle
-    block_length: the length of the interleave block
-    num_parallel_calls: the degree of interleave parallelism
-
-  Returns:
-    A dataset iterator (represented as `get_next` op) and events that can be
-    used to control the order of output elements.
-  """
-
-  # Set up threading events used to sequence when items are produced that
-  # are subsequently interleaved. These events allow us to deterministically
-  # simulate slowdowns and force sloppiness.
-  coordination_events = {i: threading.Event() for i in input_values}
-
-  def map_py_fn(x):
-    coordination_events[x].wait()
-    coordination_events[x].clear()
-    return x * x
-
-  def map_fn(x):
-    return script_ops.py_func(map_py_fn, [x], x.dtype)
-
-  def interleave_fn(x):
-    dataset = dataset_ops.Dataset.from_tensors(x)
-    dataset = dataset.repeat(x)
-    return dataset.map(map_fn)
-
-  options = dataset_ops.Options()
-  options.experimental_deterministic = False
-  dataset = dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
-      2).interleave(interleave_fn, cycle_length, block_length,
-                    num_parallel_calls).with_options(options)
-  return dataset, coordination_events
-
-
 def _repeat(values, count):
   """Produces a list of lists suitable for testing interleave.
 
@@ -252,63 +207,37 @@ class InterleaveTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.evaluate(get_next())
 
   @parameterized.named_parameters(
-      ("1", np.int64([4, 5, 6]), 2, 1, 1),
-      ("2", np.int64([4, 5, 6]), 2, 1, 2),
-      ("3", np.int64([4, 5, 6]), 2, 3, 1),
-      ("4", np.int64([4, 5, 6]), 2, 3, 2),
-      ("5", np.int64([4, 5, 6]), 3, 2, 1),
-      ("6", np.int64([4, 5, 6]), 3, 2, 2),
-      ("7", np.int64([4, 5, 6]), 3, 2, 3),
-      ("8", np.int64([4, 0, 6]), 2, 3, 1),
-      ("9", np.int64([4, 0, 6]), 2, 3, 2),
+      ("1", np.int64([4, 5, 6]), 1, 3, 1),
+      ("2", np.int64([4, 5, 6]), 2, 1, 1),
+      ("3", np.int64([4, 5, 6]), 2, 1, 2),
+      ("4", np.int64([4, 5, 6]), 2, 3, 1),
+      ("5", np.int64([4, 5, 6]), 2, 3, 2),
+      ("6", np.int64([4, 5, 6]), 7, 2, 1),
+      ("7", np.int64([4, 5, 6]), 7, 2, 3),
+      ("8", np.int64([4, 5, 6]), 7, 2, 5),
+      ("9", np.int64([4, 5, 6]), 7, 2, 7),
+      ("10", np.int64([4, 0, 6]), 2, 3, 1),
+      ("11", np.int64([4, 0, 6]), 2, 3, 2),
   )
-  def testSloppyInterleaveInOrder(self, input_values, cycle_length,
+  def testSloppyInterleaveDataset(self, input_values, cycle_length,
                                   block_length, num_parallel_calls):
-    dataset, coordination_events = _make_coordinated_sloppy_dataset(
-        input_values, cycle_length, block_length, num_parallel_calls)
-    options = dataset_ops.Options()
-    options.experimental_threading = threading_options.ThreadingOptions()
-    options.experimental_threading.private_threadpool_size = (
-        num_parallel_calls + 1)
-    dataset = dataset.with_options(options)
-
-    get_next = self.getNext(dataset, requires_initialization=True)
-    for expected_element in _interleave(
-        _repeat(input_values, 2), cycle_length, block_length):
-      coordination_events[expected_element].set()
-      self.assertEqual(expected_element * expected_element,
-                       self.evaluate(get_next()))
-    with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next())
-
-  @parameterized.named_parameters(
-      ("1", np.int64([4, 5, 6]), 2, 1, 2),
-      ("2", np.int64([4, 5, 6]), 2, 3, 2),
-      ("3", np.int64([4, 5, 6]), 3, 2, 3),
-      ("4", np.int64([4, 0, 6]), 2, 3, 2),
-  )
-  def testSloppyInterleaveOutOfOrder(self, input_values, cycle_length,
-                                     block_length, num_parallel_calls):
-    dataset, coordination_events = _make_coordinated_sloppy_dataset(
-        input_values, cycle_length, block_length, num_parallel_calls)
+    count = 2
+    dataset = dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
+        count).interleave(
+            lambda x: dataset_ops.Dataset.from_tensors(x).repeat(x),
+            cycle_length, block_length, num_parallel_calls)
     options = dataset_ops.Options()
-    options.experimental_threading = threading_options.ThreadingOptions()
-    options.experimental_threading.private_threadpool_size = (
-        num_parallel_calls + 1)
+    options.experimental_deterministic = False
     dataset = dataset.with_options(options)
-    get_next = self.getNext(dataset, requires_initialization=True)
-    elements = [
-        x for x in _interleave(
-            _repeat(input_values, 2), cycle_length, block_length)
+    expected_output = [
+        element for element in _interleave(
+            _repeat(input_values, count), cycle_length, block_length)
     ]
-    for i in [1, 4, 7]:
-      elements[i], elements[i + 1] = elements[i + 1], elements[i]
-
-    for element in elements:
-      coordination_events[element].set()
-      self.assertEqual(element * element, self.evaluate(get_next()))
-    with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next())
+    get_next = self.getNext(dataset)
+    actual_output = []
+    for _ in range(len(expected_output)):
+      actual_output.append(self.evaluate(get_next()))
+    self.assertAllEqual(expected_output.sort(), actual_output.sort())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 1ba00a8e6930284133f4807a60b49db334bf8da6..45e732be0d79a27105aa0d6ca2880bb7340c261b 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -970,8 +970,8 @@ class DatasetV2(object):
         shapes and types defined by `self.output_shapes` and
        `self.output_types`) to another nested structure of tensors.
       num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
-        representing the number elements to process in parallel. If not
-        specified, elements will be processed sequentially. If the value
+        representing the number elements to process asynchronously in parallel.
+        If not specified, elements will be processed sequentially. If the value
         `tf.data.experimental.AUTOTUNE` is used, then the number of parallel
         calls is set dynamically based on available CPU.
 
@@ -1756,8 +1756,8 @@ def make_initializable_iterator(dataset):
     RuntimeError: If eager execution is enabled.
   """
   try:
-    # Call the defined `make_one_shot_iterator()` if there is one, because some
-    # datasets (e.g. for prefetching) override its behavior.
+    # Call the defined `make_initializable_iterator()` if there is one, because
+    # some datasets (e.g. for prefetching) override its behavior.
     return dataset.make_initializable_iterator()
   except AttributeError:
     return DatasetV1Adapter(dataset).make_initializable_iterator()
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index 1dcdb880f553422c53cd8323ff888dc2e1c60719..27a700f813cf0fd3896a85fd799b02776672795c 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -19,6 +19,7 @@ exports_files(["LICENSE"])
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "py_binary")
+load("//tensorflow:tensorflow.bzl", "if_not_v2")
 load("//tensorflow:tensorflow.bzl", "if_not_windows")
 
 py_library(
@@ -406,9 +407,10 @@ py_library(
         ":debug_errors",
         ":debug_fibonacci",
         ":debug_keras",
+    ] + if_not_v2([
         ":debug_mnist",
         ":debug_tflearn_iris",
-    ],
+    ]),
 )
 
 py_binary(
diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index 02957b2fefbc0e58a9338a16e641ccb729e14ecc..a6a1c470b413958d524eed7488c35961b55d9912 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -124,6 +124,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":device_util",
+        ":numpy_dataset",
         ":reduce_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
@@ -219,7 +220,9 @@ py_library(
         ":cross_device_ops",
         ":device_util",
         ":distribute_lib",
+        ":input_lib",
         ":multi_worker_util",
+        ":numpy_dataset",
         ":reduce_util",
         ":shared_variable_creator",
         ":values",
@@ -241,6 +244,29 @@ py_library(
     ],
 )
 
+py_library(
+    name = "parameter_server_strategy",
+    srcs = ["parameter_server_strategy.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":input_lib",
+        ":mirrored_strategy",
+        ":numpy_dataset",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python/distribute:cross_device_ops",
+        "//tensorflow/python/distribute:multi_worker_util",
+        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:values",
+        "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 py_library(
     name = "multi_worker_util",
     srcs = [
@@ -253,6 +279,52 @@ py_library(
     ],
 )
 
+py_library(
+    name = "numpy_dataset",
+    srcs = ["numpy_dataset.py"],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "numpy_dataset_test",
+    size = "small",
+    srcs = ["numpy_dataset_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":numpy_dataset",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:test",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "input_lib",
+    srcs = ["input_lib.py"],
+    deps = [
+        ":device_util",
+        ":distribute_lib",
+        ":input_ops",
+        ":values",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:multi_device_iterator_ops",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 py_library(
     name = "input_ops",
     srcs = ["input_ops.py"],
@@ -348,14 +420,12 @@ py_library(
     deps = [
         ":device_util",
         ":distribute_lib",
-        ":input_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:multi_device_iterator_ops",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/training/checkpointable:base",
         "@six_archive//:six",
diff --git a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
index 529a4434127f35a2e5b88468af23cd12608a301e..72b99907017f9fb03e79444d283334a568a34e87 100644
--- a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
@@ -192,11 +192,12 @@ class TPUClusterResolver(ClusterResolver):
     for the IP addresses and ports of each Cloud TPU listed.
 
     Args:
-      tpu: Either a string, or a list of strings corresponding to the TPUs to
-        use. If the single string is the empty string, the string 'local', or a
-        string that begins with 'grpc://' or '/bns', then it is assumed to not
-        correspond with a Cloud TPU and will instead be passed as the session
-        master and no ClusterSpec propagation will be done.
+      tpu: A string corresponding to the TPU to use. If the string is the empty
+        string, the string 'local', or a string that begins with 'grpc://' or
+        '/bns', then it is assumed to not correspond with a Cloud TPU and will
+        instead be passed as the session master and no ClusterSpec propagation
+        will be done. In the future, this may also support a list of strings
+        when multiple Cloud TPUs are used.
       zone: Zone where the TPUs are located. If omitted or empty, we will assume
         that the zone of the TPU is the same as the zone of the GCE VM, which we
         will try to discover from the GCE metadata service.
diff --git a/tensorflow/python/distribute/cross_device_ops.py b/tensorflow/python/distribute/cross_device_ops.py
index 9575301d975e8ab797a0a9a79575b7f9bcbbb314..9729302c6dc1e22772c1a80a25eff17720c50994 100644
--- a/tensorflow/python/distribute/cross_device_ops.py
+++ b/tensorflow/python/distribute/cross_device_ops.py
@@ -323,6 +323,9 @@ class ReductionToOneDeviceCrossDeviceOps(CrossDeviceOps):
     assert check_destinations(destinations)
     devices = get_devices_from(destinations)
     reduce_to_device = self.reduce_to_device or devices[0]
+    logging.log_first_n(
+        logging.INFO,
+        "Reduce to %s then broadcast to %r." % (reduce_to_device, devices), 10)
     reduced = _simple_reduce(per_replica_value, reduce_to_device,
                              self.accumulation_fn, reduce_op)
     return self.broadcast(reduced, destinations)
@@ -839,9 +842,6 @@ class CollectiveAllReduce(CrossDeviceOps):
     if cross_device_utils.contains_indexed_slices(per_replica_value):
       raise ValueError(
           "`IndexSlices` is not supported for Collective All-Reduce.")
-    if context.executing_eagerly():
-      raise ValueError(
-          "Eager execution is not supported for Collective All-Reduce")
 
     all_reduced = self._batch_all_reduce(reduce_op, [per_replica_value])[0]
     device_map, logical_device = get_device_map_from(destinations)
@@ -865,9 +865,6 @@ class CollectiveAllReduce(CrossDeviceOps):
     if cross_device_utils.contains_indexed_slices(value_destination_pairs):
       raise ValueError(
           "`IndexSlices` is not supported for Collective All-Reduce.")
-    if context.executing_eagerly():
-      raise ValueError(
-          "Eager execution is not supported for Collective All-Reduce")
 
     all_devices_match = _all_devices_match(value_destination_pairs)
     if all_devices_match:
@@ -886,9 +883,6 @@ class CollectiveAllReduce(CrossDeviceOps):
 
   def _batch_all_reduce(self, reduce_op, per_replica_values):
     """All-reduce across all workers in a batch."""
-    if context.executing_eagerly():
-      raise ValueError(
-          "Eager execution with collective ops is not supported yet.")
 
     logging.log_first_n(
         logging.INFO, "Collective All-reduce invoked with batches size = %d, "
@@ -949,12 +943,9 @@ def _has_dgx1_like_links(gpu_links):
 
 def _choose_all_reduce_algorithm(device_links):
   if _has_dgx1_like_links(device_links):
-    logging.info("Configured hierarchical_copy with num_packs=%d",
-                 len(device_links))
     return AllReduceCrossDeviceOps(
         "hierarchical_copy", num_packs=len(device_links))
   else:
-    logging.info("Configured nccl all-reduce.")
     return AllReduceCrossDeviceOps("nccl", num_packs=1)
 
 
diff --git a/tensorflow/python/distribute/cross_device_utils.py b/tensorflow/python/distribute/cross_device_utils.py
index 5b4b3a6f978c0fd15bd5a67c3b47a46e5ae0e357..e8066dd467c285c50cb39b98450f5150756d6db9 100644
--- a/tensorflow/python/distribute/cross_device_utils.py
+++ b/tensorflow/python/distribute/cross_device_utils.py
@@ -23,6 +23,8 @@ import threading
 
 from tensorflow.python.distribute import all_reduce
 from tensorflow.python.distribute import values as value_lib
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -353,15 +355,25 @@ def build_collective_reduce(input_tensors,
   num_devices = len(devices)
   group_key = collective_keys.get_group_key(devices)
   instance_key = collective_keys.get_instance_key()
-  out_tensors = []
   subdiv_offsets = [0]  # TODO(tucker): maybe support non-default subdiv spec
-  for d in range(num_devices):
-    with ops.device(devices[d]):
-      reduce_op = collective_ops.all_reduce(
-          input_tensors[d], group_size, group_key, instance_key, reduction_op,
-          unary_op, subdiv_offsets)
-      out_tensors.append(reduce_op)
-  return out_tensors
+
+  def collective_all_reduce():
+    """Call collective allreduce."""
+    assert not context.executing_eagerly()
+    out_tensors = []
+    for d in range(num_devices):
+      with ops.device(devices[d]):
+        reduce_op = collective_ops.all_reduce(
+            input_tensors[d], group_size, group_key, instance_key, reduction_op,
+            unary_op, subdiv_offsets)
+        out_tensors.append(reduce_op)
+    return out_tensors
+
+  if context.executing_eagerly():
+    # Collective ops will block unless they are executed concurrently such as in
+    # a graph or a defun.
+    collective_all_reduce = def_function.function(collective_all_reduce)
+  return collective_all_reduce()
 
 
 def sum_grad_and_var_all_reduce(grad_and_vars,
diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py
index 76cbdd53d9da59b3bed318461f31591de8611fd0..5fe77e5478e94a7fcfa21935e68612420b166afb 100644
--- a/tensorflow/python/distribute/distribute_lib.py
+++ b/tensorflow/python/distribute/distribute_lib.py
@@ -26,6 +26,7 @@ import enum
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import numpy_dataset
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.eager import context as eager_context
 from tensorflow.python.framework import constant_op
@@ -33,6 +34,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses_impl
@@ -208,12 +210,14 @@ class _SameScopeAgainContext(object):
 # TODO(yuefengz): add more replication modes.
 @tf_export("distribute.InputReplicationMode")
 class InputReplicationMode(enum.Enum):
-  """Replication mode for input function."""
+  """Replication mode for input function.
 
-  # The input function will be called on each worker independently, creating as
-  # many input pipelines as number of workers. Replicas will dequeue from the
-  # local Dataset on their worker. Distribution Strategy doesn't manage any
-  # state sharing between such separate input pipelines.
+  * `PER_WORKER`: The input function will be called on each worker
+    independently, creating as many input pipelines as number of workers.
+    Replicas will dequeue from the local Dataset on their worker.
+    `tf.distribute.Strategy` doesn't manage any state sharing between such
+    separate input pipelines.
+  """
   PER_WORKER = "PER_WORKER"
 
 
@@ -351,7 +355,8 @@ class DistributionStrategy(object):
     ```
 
     Args:
-      dataset_fn: A function that returns a `tf.data.Dataset`.
+      dataset_fn: A function that returns a `tf.data.Dataset` with per-replica
+        batching.
 
     Returns:
       A `PerReplicaDataset` that will produce data for each replica.
@@ -359,7 +364,7 @@ class DistributionStrategy(object):
     return self._extended._distribute_dataset(dataset_fn)  # pylint: disable=protected-access
 
   def make_dataset_iterator(self, dataset):
-    """Makes an iterator for input provided via input_dataset.
+    """Makes an iterator for input provided via `dataset`.
 
     Data from the given dataset will be distributed evenly across all the
     compute replicas. We will assume that the input dataset is batched by the
@@ -388,28 +393,36 @@ class DistributionStrategy(object):
     """Returns an iterator split across replicas created from an input function.
 
     The `input_fn` should take an `tf.distribute.InputContext` object where
-    information about input sharding can be accessed:
+    information about batching and input sharding can be accessed:
 
     ```
     def input_fn(input_context):
-      d = tf.data.Dataset.from_tensors([[1.]]).repeat()
+      batch_size = input_context.get_per_replica_batch_size(global_batch_size)
+      d = tf.data.Dataset.from_tensors([[1.]]).repeat().batch(batch_size)
       return d.shard(input_context.num_input_pipelines,
                      input_context.input_pipeline_id)
     with strategy.scope():
-      iterator = strategy.make_input_fn_iterator(
-          input_fn)
-      replica_results = strategy.extended.call_for_each_replica(
-          replica_fn, iterator.get_next())
+      iterator = strategy.make_input_fn_iterator(input_fn)
+      replica_results = strategy.experimental_run(replica_fn, iterator)
     ```
 
+    The `tf.data.Dataset` returned by `input_fn` should have a per-replica
+    batch size, which may be computed using
+    `input_context.get_per_replica_batch_size`.
+
     Args:
-      input_fn: A function that returns a `tf.data.Dataset`. This function is
-        expected to take an `tf.distribute.InputContext` object.
+      input_fn: A function taking a `tf.distribute.InputContext` object and
+        returning a `tf.data.Dataset`.
       replication_mode: an enum value of `tf.distribute.InputReplicationMode`.
-        Only `PER_WORKER` is supported currently.
+        Only `PER_WORKER` is supported currently, which means there will be
+        a single call to `input_fn` per worker. Replicas will dequeue from the
+        local `tf.data.Dataset` on their worker.
 
     Returns:
-      An iterator object that can be initialized and fetched next element.
+      An iterator object that should first be `.initialize()`-ed. It may then
+      either be passed to `strategy.experimental_run()` or you can
+      `iterator.get_next()` to get the next value to pass to
+      `strategy.extended.call_for_each_replica()`.
     """
     if replication_mode != InputReplicationMode.PER_WORKER:
       raise ValueError(
@@ -417,6 +430,40 @@ class DistributionStrategy(object):
     return self.extended._make_input_fn_iterator(  # pylint: disable=protected-access
         input_fn, replication_mode=replication_mode)
 
+  def experimental_make_numpy_iterator(
+      self, numpy_input, batch_size, num_epochs=1, shuffle=1024, session=None):
+    """Makes an iterator for input provided via a nest of numpy arrays.
+
+    Args:
+      numpy_input: A nest of NumPy input arrays that will be distributed evenly
+        across all replicas. Note that lists of Numpy arrays are stacked,
+        as that is normal `tf.data.Dataset` behavior.
+      batch_size: The number of entries from the array we should consume in one
+        step of the computation, across all replicas. This is the global batch
+        size. It should be divisible by `num_replicas_in_sync`.
+      num_epochs: The number of times to iterate through the examples. A value
+        of `None` means repeat forever.
+      shuffle: Size of buffer to use for shuffling the input examples.
+        Use `None` to disable shuffling.
+      session: (TensorFlow v1.x graph execution only) A session used for
+        initialization.
+
+    Returns:
+      An `tf.distribute.InputIterator` which returns inputs for each step of the
+      computation.  User should call `initialize` on the returned iterator.
+    """
+    ds = self.extended.experimental_make_numpy_dataset(
+        numpy_input, session=session)
+    if shuffle:
+      ds = ds.shuffle(shuffle)
+    if num_epochs != 1:
+      ds = ds.repeat(num_epochs)
+    # We need to use the drop_remainder argument to get a known static
+    # input shape which is required for TPUs.
+    drop_remainder = self.extended.experimental_require_static_shapes
+    ds = ds.batch(batch_size, drop_remainder=drop_remainder)
+    return self.make_dataset_iterator(ds)
+
   def experimental_run(self, fn, input_iterator=None):
     """Runs ops in `fn` on each replica, with inputs from `input_iterator`.
 
@@ -1082,6 +1129,29 @@ class DistributionStrategyExtended(object):
   def _make_input_fn_iterator(self, input_fn, replication_mode):
     raise NotImplementedError("must be implemented in descendants")
 
+  def experimental_make_numpy_dataset(self, numpy_input, session=None):
+    """Makes a dataset for input provided via a numpy array.
+
+    This avoids adding `numpy_input` as a large constant in the graph,
+    and copies the data to the machine or machines that will be processing
+    the input.
+
+    Args:
+      numpy_input: A nest of NumPy input arrays that will be distributed evenly
+        across all replicas. Note that lists of Numpy arrays are stacked,
+        as that is normal `tf.data.Dataset` behavior.
+      session: (TensorFlow v1.x graph execution only) A session used for
+        initialization.
+
+    Returns:
+      A `tf.data.Dataset` representing `numpy_input`.
+    """
+    _require_cross_replica_context_extended(self)
+    return self._experimental_make_numpy_dataset(numpy_input, session=session)
+
+  def _experimental_make_numpy_dataset(self, numpy_input, session):
+    raise NotImplementedError("must be implemented in descendants")
+
   def broadcast_to(self, tensor, destinations):
     """Mirror a tensor on one device to all worker devices.
 
@@ -1554,6 +1624,50 @@ class ReplicaContext(object):
     require_replica_context(self)
     return (device_util.current(),)
 
+  def all_reduce(self, reduce_op, value):
+    """All-reduces the given `Tensor` nest across replicas.
+
+    If `all_reduce` is called in any replica, it must be called in all replicas.
+    The nested structure and `Tensor` shapes must be identical in all replicas.
+
+    IMPORTANT: The ordering of communications must be identical in all replicas.
+
+    Example with two replicas:
+      Replica 0 `value`: {'a': 1, 'b': [40,  1]}
+      Replica 1 `value`: {'a': 3, 'b': [ 2, 98]}
+
+      If `reduce_op` == `SUM`:
+        Result (on all replicas): {'a': 4, 'b': [42, 99]}
+
+      If `reduce_op` == `MEAN`:
+        Result (on all replicas): {'a': 2, 'b': [21, 49.5]}
+
+    Args:
+      reduce_op: Reduction type, an instance of `tf.distribute.ReduceOp` enum.
+      value: The nested structure of `Tensor`s to all-reduced.
+        The structure must be compatible with `tf.nest`.
+
+    Returns:
+       A `Tensor` nest with the reduced `value`s from each replica.
+    """
+    def batch_all_reduce(strategy, *value_flat):
+      return strategy.extended.batch_reduce_to(
+          reduce_op, [(v, _batch_reduce_destination(v)) for v in value_flat])
+
+    if reduce_op in [reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN]:
+      # TODO(cjfj): Work out why `batch_reduce` doesn't return the correct grad.
+      @custom_gradient.custom_gradient
+      def grad_wrapper(*xs):
+        ys = self.merge_call(batch_all_reduce, args=xs)
+        # The gradient of an all-sum is itself an all-sum (all-mean, likewise).
+        return ys, lambda *dy_s: self.all_reduce(reduce_op, dy_s)
+      return nest.pack_sequence_as(value, grad_wrapper(*nest.flatten(value)))
+    else:
+      # TODO(cjfj): Implement gradients for other reductions.
+      reduced = nest.pack_sequence_as(
+          value, self.merge_call(batch_all_reduce, args=nest.flatten(value)))
+      return nest.map_structure(array_ops.prevent_gradient, reduced)
+
   # TODO(josh11b): Implement `start_all_reduce(method, t)` for efficient
   # all-reduce. It would return a function returning the result of reducing `t`
   # across all replicas. The caller would wait to call this function until they
@@ -1564,6 +1678,15 @@ class ReplicaContext(object):
   #   to that point that the first result is needed. Most likely this can be
   #   implemented in terms of `merge_call()` and `batch_reduce_to()`.
 
+
+def _batch_reduce_destination(x):
+  """Returns the destinations for batch all-reduce."""
+  if isinstance(x, ops.Tensor):  # One device strategies.
+    return x.device
+  else:
+    return x
+
+
 # ------------------------------------------------------------------------------
 
 
@@ -1606,6 +1729,18 @@ class _DefaultDistributionExtended(DistributionStrategyExtended):
                               replication_mode=InputReplicationMode.PER_WORKER):
     return input_fn(InputContext()).make_initializable_iterator()
 
+  def _experimental_make_numpy_dataset(self, numpy_input, session):
+    numpy_flat = nest.flatten(numpy_input)
+    vars_flat = tuple(
+        variable_scope.variable(array_ops.zeros(i.shape, i.dtype),
+                                trainable=False, use_resource=True)
+        for i in numpy_flat
+    )
+    for v, i in zip(vars_flat, numpy_flat):
+      numpy_dataset.init_var_from_numpy(v, i, session)
+    vars_nested = nest.pack_sequence_as(numpy_input, vars_flat)
+    return dataset_ops.Dataset.from_tensor_slices(vars_nested)
+
   def _broadcast_to(self, tensor, destinations):
     if destinations is None:
       return tensor
@@ -1689,6 +1824,7 @@ class _DefaultDistributionExtended(DistributionStrategyExtended):
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
   def _global_batch_size(self):
+    """Global and per-replica batching are equivalent for this strategy."""
     return True
 
 
diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py
new file mode 100644
index 0000000000000000000000000000000000000000..c64eea1604e40910b7b31df56bdc9c992aeafd5b
--- /dev/null
+++ b/tensorflow/python/distribute/input_lib.py
@@ -0,0 +1,721 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Various classes representing distributed inputs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import multi_device_iterator_ops
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import input_ops
+from tensorflow.python.distribute import values
+from tensorflow.python.eager import context
+from tensorflow.python.framework import device as tf_device
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.util import nest
+
+
+class InputWorkers(object):
+  """A 1-to-many mapping from input worker devices to compute devices."""
+
+  def __init__(self, device_map, worker_device_pairs=None, logical_device=0):
+    """Initialize an `InputWorkers` object.
+
+    Args:
+      device_map: A `DeviceMap` with the computation devices fed by the
+        input workers.
+      worker_device_pairs: A sequence of pairs:
+        `(input device, a tuple of compute devices fed by that input device)`.
+      logical_device: The logical device of `device_map` to feed.
+    """
+    self._device_map = device_map
+    self._logical_device = logical_device
+    if worker_device_pairs is None:
+      worker_device_pairs = ((
+          device_util.canonicalize("/device:CPU:0"),
+          device_map.logical_to_actual_devices(logical_device)),)
+    self._input_worker_devices = tuple(d for d, _ in worker_device_pairs)
+    self._fed_devices = tuple(tuple(device_util.canonicalize(d) for d in f)
+                              for _, f in worker_device_pairs)
+    flattened = tuple(d for l in self._fed_devices for d in l)
+    assert (flattened ==
+            device_map.logical_to_actual_devices(logical_device)), (
+                "flattened: %s logical device %d: %s" %
+                (flattened, logical_device,
+                 device_map.logical_to_actual_devices(logical_device)))
+
+  @property
+  def device_map(self):
+    return self._device_map
+
+  @property
+  def logical_device(self):
+    return self._logical_device
+
+  @property
+  def num_workers(self):
+    return len(self._input_worker_devices)
+
+  @property
+  def worker_devices(self):
+    return self._input_worker_devices
+
+  def compute_devices_for_worker(self, worker_index):
+    return self._fed_devices[worker_index]
+
+  def __repr__(self):
+    devices = self.worker_devices
+    debug_repr = ",\n".join("  %d %s: %s" %
+                            (i, devices[i], self._fed_devices[i])
+                            for i in range(len(devices)))
+    return "%s:{\n%s\n  device_map: %s}" % (
+        self.__class__.__name__, debug_repr, self._device_map)
+
+
+class PerReplicaDataIterator(object):
+  """An iterator (like `tf.data.Iterator`) into a `PerReplicaDataset`."""
+
+  def __init__(self, iterator, input_workers, worker_index, prefetch_on_device):
+    assert isinstance(input_workers, InputWorkers)
+    self._iterator = iterator
+    self._input_workers = input_workers
+    self._worker_index = worker_index
+    self._prefetch_on_device = prefetch_on_device
+
+  @property
+  def initializer(self):
+    return self._iterator.initializer
+
+  def get_next_as_list(self, name=None):
+    """Scatter the input across devices."""
+    if self._prefetch_on_device:
+      data_list = self._iterator.get_next()
+    else:
+      batch = self._iterator.get_next(name=name)
+      data_list = []
+      def get_ith(i):
+        return lambda x: x[i]
+
+      devices = self._input_workers.compute_devices_for_worker(
+          self._worker_index)
+      for i, d in enumerate(devices):
+        v = nest.map_structure(get_ith(i), batch)
+        if context.executing_eagerly():
+          with ops.device(d):
+            v = nest.map_structure(array_ops.identity, v)
+        data_list.append(v)
+
+    return data_list
+
+  def get_next(self, name=None):
+    assert self._input_workers.num_workers == 1
+    data_list = self.get_next_as_list(name)
+    return values.regroup(self._input_workers.device_map, data_list)
+
+  @property
+  def output_classes(self):
+    return self._iterator.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._iterator.output_shapes
+
+  @property
+  def output_types(self):
+    return self._iterator.output_types
+
+
+class PerReplicaDataset(object):
+  """Like `tf.data.Dataset` split devices, producing `PerReplica` data."""
+
+  def __init__(self, dataset, input_workers, worker_index,
+               prefetch_on_device=None):
+    assert isinstance(input_workers, InputWorkers)
+    assert worker_index is not None
+    assert worker_index is not True  # pylint: disable=g-bool-id-comparison
+    assert worker_index is not False  # pylint: disable=g-bool-id-comparison
+    self._input_workers = input_workers
+    self._worker_index = worker_index
+
+    # Default to using prefetching, unless specified.
+    self._prefetch_on_device = prefetch_on_device
+    if self._prefetch_on_device is None:
+      self._prefetch_on_device = True
+
+    self._dataset = dataset
+    if not self._prefetch_on_device:
+      # TODO(priyag): If dropping remainder is not appropriate, find another
+      # approach to distributing the dataset when not possible to divide evenly.
+      # Possibly not an issue when we start using PartitionedDataset.
+      num_replicas = len(
+          self._input_workers.compute_devices_for_worker(self._worker_index))
+      self._dataset = self._dataset.batch(num_replicas, drop_remainder=True)
+    else:
+      self._replica_devices = self._input_workers.compute_devices_for_worker(
+          self._worker_index)
+
+  def make_one_shot_iterator(self):
+    """Get a one time use iterator for the distributed PerReplicaDataset."""
+    # Graph mode with one shot iterator is disabled.
+    if not context.executing_eagerly():
+      raise ValueError("Cannot create a one shot iterator. Please use "
+                       "`make_initializable_iterator()` instead.")
+    if self._prefetch_on_device:
+      dataset_iterator = multi_device_iterator_ops.MultiDeviceIterator(
+          self._dataset, self._replica_devices)
+    else:
+      dataset_iterator = dataset_ops.make_one_shot_iterator(self._dataset)
+    return PerReplicaDataIterator(
+        dataset_iterator,
+        self._input_workers,
+        self._worker_index,
+        prefetch_on_device=self._prefetch_on_device)
+
+  def make_initializable_iterator(self):
+    """Get an initializable iterator for the distributed PerReplicaDataset."""
+    # Eager mode generates already initialized iterators. Hence we cannot create
+    # an initializable iterator.
+    if context.executing_eagerly():
+      raise ValueError("Cannot create initializable iterator in Eager mode. "
+                       "Please use `make_one_shot_iterator` instead.")
+    if self._prefetch_on_device:
+      dataset_iterator = multi_device_iterator_ops.MultiDeviceIterator(
+          self._dataset, self._replica_devices)
+    else:
+      dataset_iterator = dataset_ops.make_initializable_iterator(self._dataset)
+    return PerReplicaDataIterator(
+        dataset_iterator, self._input_workers, self._worker_index,
+        prefetch_on_device=self._prefetch_on_device)
+
+
+class MultiWorkerDataIterator(object):
+  """An iterator (like `tf.data.Iterator`) into a `MultiWorkerDataset`."""
+
+  def __init__(self, iterators, input_workers):
+    """Initialize the `MultiWorkerDataIterator` object.
+
+    Args:
+      iterators: a list of worker, iterator pairs.
+      input_workers: an `InputWorkers` object.
+
+    Raises:
+      ValueError: if iterators and input_workers are not compatible.
+    """
+    assert isinstance(input_workers, InputWorkers)
+    workers = tuple(d for d, _ in iterators)
+    if workers != input_workers.worker_devices:
+      raise ValueError("iterators and input_workers are not compatible. "
+                       "iterator workers: %r input_workers devices: %r" %
+                       (workers, input_workers.worker_devices))
+    self._iterators = tuple(i for _, i in iterators)
+    self._input_workers = input_workers
+
+  @property
+  def initializer(self):
+    return control_flow_ops.group(
+        tuple(iterator.initializer for iterator in self._iterators))
+
+  def get_iterator(self, worker):
+    for i, w in enumerate(self._input_workers.worker_devices):
+      if worker == w:
+        return self._iterators[i]
+    return None
+
+  @property
+  def output_shapes(self):
+    return self._iterators[0].output_shapes
+
+  @property
+  def output_types(self):
+    return self._iterators[0].output_types
+
+  def get_next(self, name=None):
+    """Scatter the input across hosts and devices."""
+    replicas = []
+    for worker, iterator in zip(self._input_workers.worker_devices,
+                                self._iterators):
+      if name is not None:
+        d = tf_device.DeviceSpec.from_string(worker)
+        new_name = "%s_%s_%d" % (name, d.job, d.task)
+      else:
+        new_name = None
+      with ops.device(worker):
+        data_per_worker = iterator.get_next_as_list(name=new_name)
+        # Append to replicas to get a flat list of values indexed by replica.
+        replicas.extend(data_per_worker)
+
+    return values.regroup(self._input_workers.device_map, replicas)
+
+
+class MultiWorkerDataset(object):
+  """Like a `tf.data.Dataset` that distributes data to different workers.
+
+  Each worker gets one shard of the input dataset. This currently does not work
+  in eager mode.
+  """
+
+  def __init__(self, dataset_fn, input_workers, prefetch_on_device=None,
+               auto_shard=False):
+    """Initialize the MultiWorkerDataset object.
+
+    Args:
+      dataset_fn: a function or a list of functions that returns a
+        `tf.data.Dataset`.
+      input_workers: an `InputWorkers` object.
+      prefetch_on_device: whether to prefetch to devices.
+      auto_shard: whether to auto-shard the dataset.
+    """
+    assert isinstance(input_workers, InputWorkers)
+    if isinstance(dataset_fn, (list, tuple)):
+      if len(dataset_fn) != input_workers.num_workers:
+        raise ValueError("If `dataset_fn` is a list, it must have one entry "
+                         "per worker")
+    # TODO(rohanj): b/120673685 to track re-enabling auto sharding.
+    if auto_shard:
+      raise ValueError("Currently autosharding is not supported.")
+    self._input_workers = input_workers
+    self._datasets = []
+    # TODO(yuefengz, priyag): support different set of jobs for input
+    # processing.
+    for i, worker in enumerate(input_workers.worker_devices):
+      with ops.device(worker):
+        if isinstance(dataset_fn, (list, tuple)):
+          worker_input = dataset_fn[i]()
+        else:
+          worker_input = dataset_fn()
+        dataset = PerReplicaDataset(worker_input, input_workers, i,
+                                    prefetch_on_device=prefetch_on_device)
+        self._datasets.append((worker, dataset))
+
+  def make_one_shot_iterator(self):
+    iterators = []
+    for worker, dataset in self._datasets:
+      with ops.device(worker):
+        iterators.append((worker, dataset_ops.make_one_shot_iterator(dataset)))
+    return MultiWorkerDataIterator(iterators, self._input_workers)
+
+  def make_initializable_iterator(self):
+    iterators = []
+    for worker, dataset in self._datasets:
+      with ops.device(worker):
+        iterators.append(
+            (worker, dataset_ops.make_initializable_iterator(dataset)))
+    return MultiWorkerDataIterator(iterators, self._input_workers)
+
+
+class InputIterator(object):
+  """An input iterator, intended to be passed to `DistributionStrategy.run`."""
+
+  def get_next(self):
+    """Returns the next inputs for all replicas."""
+    raise NotImplementedError("must be implemented in descendants")
+
+  def initialize(self):
+    """Initialize the underlying input dataset, when applicable.
+
+    In eager mode, this will create a new iterator and return it.
+    In graph mode, this will initialize the same underlying iterator(s).
+
+    Users are required to call this if
+    - This iterator was returned from a call to `make_input_fn_iterator` with an
+      input function that returns a dataset.
+    - Or this iterator was returned from a call to `make_dataset_iterator`.
+
+    Returns:
+      A list of initialization ops to be executed.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+
+class InputIteratorImpl(InputIterator):
+  """Common implementation for all input iterators."""
+
+  def __init__(self, input_workers, iterators):
+    assert isinstance(input_workers, InputWorkers)
+    if not input_workers.worker_devices:
+      raise ValueError("Should have at least one worker for input iterator.")
+
+    self._iterators = iterators
+    self._input_workers = input_workers
+
+  def get_next(self, name=None):
+    """Returns the next input from the iterator for all replicas."""
+    replicas = []
+    for i, worker in enumerate(self._input_workers.worker_devices):
+      if name is not None:
+        d = tf_device.DeviceSpec.from_string(worker)
+        new_name = "%s_%s_%d" % (name, d.job, d.task)
+      else:
+        new_name = None
+      with ops.device(worker):
+        # Make `replicas` a flat list of values across all replicas.
+        replicas.extend(self._iterators[i].get_next_as_list(new_name))
+
+    return values.regroup(self._input_workers.device_map, replicas)
+
+  def initialize(self):
+    """Initialze underlying iterators.
+
+    Returns:
+      A list of any initializer ops that should be run.
+    """
+    init_ops = []
+    for it in self._iterators:
+      init_ops.extend(it.initialize())
+    return init_ops
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  @property
+  def output_classes(self):
+    return self._iterators[0].output_classes
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  @property
+  def output_shapes(self):
+    return self._iterators[0].output_shapes
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  @property
+  def output_types(self):
+    return self._iterators[0].output_types
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  def get_iterator(self, worker):
+    for i, w in enumerate(self._input_workers.worker_devices):
+      if worker == w:
+        return self._iterators[i]
+    return None
+
+
+class InputFunctionIterator(InputIteratorImpl):
+  """Iterator created from input function."""
+
+  def __init__(self, input_fn, input_workers, input_contexts):
+    """Make an iterator for input provided via an input function.
+
+    Currently implements PER_WORKER mode, in which the `input_fn` is called
+    once on each worker.
+
+    TODO(priyag): Add other replication modes.
+    TODO(priyag): Allow taking input function that returns a callable that
+    returns nest of tensors.
+
+    Args:
+      input_fn: Input function that returns a `tf.data.Dataset` object.
+      input_workers: an `InputWorkers` object.
+      input_contexts: A list of `InputContext` instances to be passed to call(s)
+        to `input_fn`. Length and order should match worker order in
+        `worker_device_pairs`.
+    """
+    assert isinstance(input_workers, InputWorkers)
+    if input_workers.num_workers != len(input_contexts):
+      raise ValueError(
+          "Number of input workers (%d) is not same as number of "
+          "input_contexts (%d)" %
+          (input_workers.num_workers, len(input_contexts)))
+
+    iterators = []
+    for i, ctx in enumerate(input_contexts):
+      worker = input_workers.worker_devices[i]
+      with ops.device(worker):
+        result = input_fn(ctx)
+        if not isinstance(result, dataset_ops.DatasetV2):
+          raise ValueError("input_fn must return a tf.data.Dataset.")
+        devices = input_workers.compute_devices_for_worker(i)
+        iterator = _SingleWorkerDatasetIterator(result, worker, devices)
+        iterators.append(iterator)
+
+    super(InputFunctionIterator, self).__init__(input_workers, iterators)
+
+
+class DatasetIterator(InputIteratorImpl):
+  """Iterator created from input dataset."""
+
+  def __init__(self, dataset, input_workers, split_batch_by=None):
+    """Make an iterator for the dataset on given devices.
+
+    If `split_batch_by` is not None, we "split" each batch of the
+    dataset by `split_batch_by` value. To achieve this, we first unbatch the
+    input dataset and then rebatch it with the per replica batch size that is
+    calculated using `global_batch_size // split_batch_by`.
+    The currently supported datasets are as follows:
+    `dataset.batch()` is the last operation on the dataset OR
+    `dataset.apply(map_and_batch)` is the last operation on the dataset OR
+    `dataset.batch().prefetch()` are the last 2 operations on the dataset OR
+    `dataset.apply(map_and_batch).prefetch()` are the last 2 operations.
+
+    TODO(priyag): Support multi worker / host cases properly by cloning
+    and sharding the dataset on each worker. Current setup will only work in
+    some cases, such as in-graph multi worker GPU case. If the input pipeline
+    has random shuffling (with a different seed on each worker), each worker
+    will see random input from the same overall dataset in each step. Otherwise,
+    each worker will see the same input in each step.
+
+    Args:
+      dataset: `tf.data.Dataset` that will be used as the input source.
+      input_workers: an `InputWorkers` object.
+      split_batch_by: Optional integer. If present, we "split" each batch of the
+        dataset by `split_batch_by` value.
+    """
+    assert isinstance(input_workers, InputWorkers)
+    if split_batch_by:
+      dataset = _split_dataset_batch(dataset, split_batch_by)
+
+    iterators = []
+    for i, worker in enumerate(input_workers.worker_devices):
+      with ops.device(worker):
+        worker_devices = input_workers.compute_devices_for_worker(i)
+        cloned_dataset = dataset
+        if not context.executing_eagerly():
+          cloned_dataset = input_ops._clone_dataset(dataset)  # pylint: disable=protected-access
+        iterator = _SingleWorkerDatasetIterator(cloned_dataset, worker,
+                                                worker_devices)
+        iterators.append(iterator)
+
+    super(DatasetIterator, self).__init__(input_workers, iterators)
+
+
+class _SingleWorkerDatasetIterator(object):
+  """Iterator for a single `tf.data.Dataset`."""
+
+  def __init__(self, dataset, worker, devices):
+    """Create iterator for the `dataset` to fetch data to worker's `devices` .
+
+    `MultiDeviceIterator` is used to prefetch input to the devices on the
+    given worker.
+
+    Args:
+      dataset: A `tf.data.Dataset` instance.
+      worker: Worker on which ops should be created.
+      devices: Distribute data from `dataset` to these devices.
+    """
+    self._dataset = dataset
+    self._worker = worker
+    self._devices = devices
+    self._make_iterator()
+
+  def _make_iterator(self):
+    """Make appropriate iterator on the dataset."""
+    with ops.device(self._worker):
+      self._iterator = multi_device_iterator_ops.MultiDeviceIterator(
+          self._dataset, self._devices)
+
+  def get_next_as_list(self, name=None):
+    """Get next element from the underlying iterator."""
+    del name
+    with ops.device(self._worker):
+      data_list = self._iterator.get_next()
+      return data_list
+
+  def initialize(self):
+    """Initialze underlying iterator.
+
+    In eager execution, this simply recreates the underlying iterator.
+    In graph execution, it returns the initializer ops for the underlying
+    iterator.
+
+    Returns:
+      A list of any initializer ops that should be run.
+    """
+    if context.executing_eagerly():
+      self._make_iterator()
+      return []
+    else:
+      return [self._iterator.initializer]
+
+  @property
+  def output_classes(self):
+    return self._iterator.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._iterator.output_shapes
+
+  @property
+  def output_types(self):
+    return self._iterator.output_types
+
+
+def _split_dataset_batch(dataset, split_batch_by):
+  """Divide a batch-ed dataset's batches into smaller batches."""
+  # TODO(sourabhbajaj): Remove this in lieu of distributed datasets
+  # pylint: disable=protected-access
+  def _get_batch_dataset(d):
+    """Get the underlying batch dataset from the dataset object."""
+    if isinstance(d, dataset_ops.DatasetV1Adapter):
+      d = d._dataset
+
+    if isinstance(d, (dataset_ops.BatchDataset, batching._MapAndBatchDataset)):
+      return d
+    elif isinstance(d, dataset_ops.PrefetchDataset):
+      return _get_batch_dataset(d._input_dataset)
+    raise ValueError(
+        "Unable to get batched dataset from the input dataset. `batch` "
+        "`map_and_batch` need to be the last operations on the dataset. "
+        "The batch operations can be followed by a prefetch.")
+
+  batched_dataset = _get_batch_dataset(dataset)
+  prev_dataset = batched_dataset._input_dataset
+
+  num_parallel_calls = None
+  map_func = None
+
+  if isinstance(batched_dataset, dataset_ops.BatchDataset):
+    batch_size = batched_dataset._batch_size
+    drop_remainder = batched_dataset._drop_remainder
+  elif isinstance(batched_dataset, batching._MapAndBatchDataset):
+    batch_size = batched_dataset._batch_size_t
+    drop_remainder = batched_dataset._drop_remainder_t
+    num_parallel_calls = batched_dataset._num_parallel_calls_t
+    map_func = batched_dataset._map_func
+
+  prefetch_buffer = None
+  if isinstance(dataset, dataset_ops.PrefetchDataset):
+    prefetch_buffer = dataset._buffer_size
+  elif (isinstance(dataset, dataset_ops.DatasetV1Adapter)
+        and isinstance(dataset._dataset, dataset_ops.PrefetchDataset)):
+    prefetch_buffer = dataset._dataset._buffer_size
+
+  if tensor_util.is_tensor(batch_size):
+    batch_size = tensor_util.constant_value(batch_size)
+
+  if tensor_util.is_tensor(drop_remainder):
+    drop_remainder = tensor_util.constant_value(drop_remainder)
+
+  if num_parallel_calls is not None and tensor_util.is_tensor(drop_remainder):
+    num_parallel_calls = tensor_util.constant_value(num_parallel_calls)
+
+  if batch_size % split_batch_by:
+    raise ValueError(
+        "Batch size %s cannot be sharded evenly across replicas %s" % (
+            batch_size, split_batch_by))
+  new_batch_size = batch_size // split_batch_by
+
+  if isinstance(batched_dataset, dataset_ops.BatchDataset):
+    dataset = prev_dataset.batch(new_batch_size, drop_remainder=drop_remainder)
+  elif isinstance(batched_dataset, batching._MapAndBatchDataset):
+    dataset = prev_dataset.apply(batching.map_and_batch(
+        map_func, new_batch_size, num_parallel_calls, drop_remainder))
+  # pylint: enable=protected-access
+
+  if prefetch_buffer is not None:
+    dataset = dataset.prefetch(prefetch_buffer)
+  return dataset
+
+
+class MultiStepContext(object):
+  """A context object that can be used to capture things when running steps.
+
+  This context object is useful when running multiple steps at a time using the
+  `experimental_run_steps_on_iterator` API. For e.g. it allows the user's step
+  function to specify which outputs to emit at what frequency. Currently it
+  supports capturing output from the last step, as well as capturing non tensor
+  outputs.  In the future it will be augmented to support other use cases such
+  as output each N steps.
+  """
+
+  def __init__(self):
+    """Initialize an output context.
+
+    Returns:
+      A context object.
+    """
+    self._last_step_outputs = {}
+    self._last_step_outputs_reduce_ops = {}
+    self._non_tensor_outputs = {}
+
+  @property
+  def last_step_outputs(self):
+    """A dictionary consisting of outputs to be captured on last step.
+
+    Keys in the dictionary are names of tensors to be captured, as specified
+    when `set_last_step_output` is called.
+    Values in the dictionary are the tensors themselves. If
+    `set_last_step_output` was called with a `reduce_op` for this output,
+    then the value is the reduced value.
+
+    Returns:
+      A dictionary with last step outputs.
+    """
+    return self._last_step_outputs
+
+  def _set_last_step_outputs(self, outputs):
+    """Replace the entire dictionary of last step outputs."""
+    if not isinstance(outputs, dict):
+      raise ValueError("Need a dictionary to set last_step_outputs.")
+    self._last_step_outputs = outputs
+
+  def set_last_step_output(self, name, output, reduce_op=None):
+    """Set `output` with `name` to be outputted from the last step.
+
+    Args:
+      name: String, name to identify the output. Doesn't need to match tensor
+        name.
+      output: The tensors that should be outputted with `name`. See below for
+        actual types supported.
+      reduce_op: Reduction method to use to reduce outputs from multiple
+        replicas. Required if `set_last_step_output` is called in a replica
+        context. Optional in cross_replica_context.
+        When present, the outputs from all the replicas are reduced using the
+        current distribution strategy's `reduce` method. Hence, the type of
+        `output` must be what's supported by the corresponding `reduce` method.
+        For e.g. if using MirroredStrategy and reduction is set, output
+        must be a `PerReplica` value.
+        The reduce method is also recorded in a dictionary
+        `_last_step_outputs_reduce_ops` for later interpreting of the
+        outputs as already reduced or not.
+    """
+    if distribution_strategy_context.in_cross_replica_context():
+      self._last_step_outputs_reduce_ops[name] = reduce_op
+      if reduce_op is None:
+        self._last_step_outputs[name] = output
+      else:
+        distribution = distribution_strategy_context.get_strategy()
+        self._last_step_outputs[name] = distribution.reduce(reduce_op, output)
+    else:
+      assert reduce_op is not None
+      def merge_fn(distribution, value):
+        self._last_step_outputs[name] = distribution.reduce(reduce_op, value)
+        # Setting this inside the `merge_fn` because all replicas share the same
+        # context object, so it's more robust to set it only once (even if all
+        # the replicas are trying to set the same value).
+        self._last_step_outputs_reduce_ops[name] = reduce_op
+
+      distribution_strategy_context.get_replica_context().merge_call(
+          merge_fn, args=(output,))
+
+  @property
+  def non_tensor_outputs(self):
+    """A dictionary consisting of any non tensor outputs to be captured."""
+    return self._non_tensor_outputs
+
+  def set_non_tensor_output(self, name, output):
+    """Set `output` with `name` to be captured as a non tensor output."""
+    if distribution_strategy_context.in_cross_replica_context():
+      self._non_tensor_outputs[name] = output
+    else:
+      def merge_fn(distribution, value):
+        # NOTE(priyag): For non tensor outputs, we simply return all the values
+        # in a list as reduction doesn't make sense on non tensors.
+        self._non_tensor_outputs[name] = distribution.unwrap(value)
+      distribution_strategy_context.get_replica_context().merge_call(
+          merge_fn, args=(output,))
diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py
index 601eafbb5ea97b7858b3dd57104b2b8c780873d1..1ed7eef1edb5ffcef593f5de469d4f27f2535911 100644
--- a/tensorflow/python/distribute/mirrored_strategy.py
+++ b/tensorflow/python/distribute/mirrored_strategy.py
@@ -27,7 +27,9 @@ from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
 from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import numpy_dataset
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.distribute import shared_variable_creator
 from tensorflow.python.distribute import values
@@ -456,9 +458,10 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
         "No duplicates allowed in `devices` argument: %s" % devices)
     # TODO(josh11b): Require at least 2 devices?
     self._device_map = values.ReplicaDeviceMap(devices)
-    self._input_workers = values.InputWorkers(self._device_map)
+    self._input_workers = input_lib.InputWorkers(self._device_map)
     self._inferred_cross_device_ops = cross_device_ops_lib.choose_the_best(
         devices)
+    self._host_input_device = numpy_dataset.SingleDevice("/cpu:0")
 
   def _initialize_multi_worker(self, devices):
     """Initializes the object for multi-worker training."""
@@ -487,9 +490,11 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
     # their ops will end up on the cpu device of its first worker, e.g.
     # "/job:worker/task:0/device:CPU:0". Note this is not used in replica mode.
     self._default_device = workers[0]
+    self._host_input_device = numpy_dataset.SingleDevice(workers[0])
 
     self._device_map = values.ReplicaDeviceMap(devices)
-    self._input_workers = values.InputWorkers(self._device_map, worker_devices)
+    self._input_workers = input_lib.InputWorkers(
+        self._device_map, worker_devices)
     self._inferred_cross_device_ops = cross_device_ops_lib.MultiWorkerAllReduce(
         workers, _infer_num_gpus_per_worker(devices))
 
@@ -499,6 +504,9 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
     if colocate_with is None:
       device_map = self._device_map
       logical_device = 0  # TODO(josh11b): Get logical device from scope here.
+    elif isinstance(colocate_with, numpy_dataset.SingleDevice):
+      with ops.device(colocate_with.device):
+        return next_creator(*args, **kwargs)
     else:
       device_map = colocate_with.device_map
       logical_device = colocate_with.logical_device
@@ -543,16 +551,16 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
   def _distribute_dataset(self, dataset_fn):
     if self._local_mode:
       worker_index = 0
-      return values.PerReplicaDataset(
+      return input_lib.PerReplicaDataset(
           self._call_dataset_fn(dataset_fn), self._input_workers, worker_index)
     else:
-      return values.MultiWorkerDataset(
+      return input_lib.MultiWorkerDataset(
           functools.partial(self._call_dataset_fn, dataset_fn),
           self._input_workers,
           auto_shard=False)
 
   def _make_dataset_iterator(self, dataset):
-    return values.DatasetIterator(
+    return input_lib.DatasetIterator(
         dataset, self._input_workers, self._num_replicas_in_sync)
 
   def _make_input_fn_iterator(
@@ -566,9 +574,13 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
           num_input_pipelines=num_workers,
           input_pipeline_id=i,
           num_replicas_in_sync=self._num_replicas_in_sync))
-    return values.InputFunctionIterator(
+    return input_lib.InputFunctionIterator(
         input_fn, self._input_workers, input_contexts)
 
+  def _experimental_make_numpy_dataset(self, numpy_input, session):
+    return numpy_dataset.one_host_numpy_dataset(
+        numpy_input, self._host_input_device, session)
+
   # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
   def _experimental_run_steps_on_iterator(self, fn, iterator, iterations,
                                           initial_loop_values=None):
@@ -576,7 +588,7 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
       initial_loop_values = {}
     initial_loop_values = nest.flatten(initial_loop_values)
 
-    ctx = values.MultiStepContext()
+    ctx = input_lib.MultiStepContext()
     def body(i, *args):
       """A wrapper around `fn` to create the while loop body."""
       del args
@@ -770,6 +782,14 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
   def _global_batch_size(self):
+    """`make_dataset_iterator` and `make_numpy_iterator` use global batch size.
+
+    `distribute_dataset` and `make_input_fn_iterator` assume per-replica
+    batching.
+
+    Returns:
+      Boolean.
+    """
     return True
 
 
diff --git a/tensorflow/python/distribute/numpy_dataset.py b/tensorflow/python/distribute/numpy_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..5881e4cd59e75ac5184e400bd0ac90443084635e
--- /dev/null
+++ b/tensorflow/python/distribute/numpy_dataset.py
@@ -0,0 +1,97 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Code for creating a dataset out of a NumPy array."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import nest
+
+
+def init_var_from_numpy(input_var, numpy_input, session):
+  """Initialize `input_var` to `numpy_input` using `session` in graph mode."""
+  with ops.init_scope():
+    if context.executing_eagerly():
+      input_var.assign(numpy_input)
+      return
+
+    assert session is not None
+    session.run(input_var.initializer)
+
+    start_placeholder = array_ops.placeholder(dtypes.int64, ())
+    end_placeholder = array_ops.placeholder(dtypes.int64, ())
+    slice_placeholder = array_ops.placeholder(input_var.dtype)
+    assign_slice_op = input_var[start_placeholder:end_placeholder].assign(
+        slice_placeholder)
+
+    # If each batch element is > 64 MB, then we copy each batch element
+    # individually. Otherwise, the slices will be < 128 MB. There might be
+    # padding which might mean that the slices are 128 MB even if the size of
+    # the tensor allocated is less than 128 MB.  This formula gives slices with
+    # size: ceil(64 MB / byte size per batch element) bytes.  Using ceil()
+    # guarantees we get a number >= 1.
+
+    # Calculate the size of each batch element.
+    byte_size_per_batch_element = (
+        np.prod(numpy_input.shape[1:]) * input_var.dtype.size)
+
+    # Calculate number of elements we want to copy per slice.
+    batch_size_per_slice = int(
+        np.ceil((64 << 20) / byte_size_per_batch_element))
+
+    # Copy slices of the above size starting at 0, except the last slice will be
+    # smaller.
+    start = 0
+    limit = numpy_input.shape[0]
+    while start < limit:
+      end = min(start + batch_size_per_slice, limit)
+      session.run(assign_slice_op, feed_dict={
+          start_placeholder: start,
+          end_placeholder: end,
+          slice_placeholder: numpy_input[start:end]})
+      start = end
+
+
+def one_host_numpy_dataset(numpy_input, colocate_with, session):
+  """Create a dataset on `colocate_with` from `numpy_input`."""
+  def create_colocated_variable(next_creator, *args, **kwargs):
+    kwargs["colocate_with"] = colocate_with
+    return next_creator(*args, **kwargs)
+
+  numpy_flat = nest.flatten(numpy_input)
+  with variable_scope.variable_creator_scope(create_colocated_variable):
+    vars_flat = tuple(variable_scope.variable(array_ops.zeros(i.shape, i.dtype),
+                                              trainable=False)
+                      for i in numpy_flat)
+  for v, i in zip(vars_flat, numpy_flat):
+    init_var_from_numpy(v, i, session)
+  vars_nested = nest.pack_sequence_as(numpy_input, vars_flat)
+  return dataset_ops.Dataset.from_tensor_slices(vars_nested)
+
+
+class SingleDevice(object):
+  """Used with `colocate_with` to create a non-mirrored variable."""
+
+  def __init__(self, device):
+    self.device = device
diff --git a/tensorflow/python/distribute/numpy_dataset_test.py b/tensorflow/python/distribute/numpy_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..04eae1daa2ee83040f4d9acb3a79baa6be16f402
--- /dev/null
+++ b/tensorflow/python/distribute/numpy_dataset_test.py
@@ -0,0 +1,44 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for numpy_dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.distribute import numpy_dataset
+from tensorflow.python.eager import test
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variable_scope
+
+
+class InitVarFromNumpyTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_creating_var_with_numpy_arrays(self):
+    with self.cached_session() as session:
+      x = np.asarray(np.random.random((64, 3)), dtype=np.float32)
+      initial = np.zeros_like(x)
+      var_x = variable_scope.variable(initial)
+      numpy_dataset.init_var_from_numpy(var_x, x, session)
+      val = self.evaluate(var_x.value())
+      # Verify that the numpy value is copied to the variable.
+      self.assertAllEqual(x, val)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/distribute/parameter_server_strategy.py b/tensorflow/python/distribute/parameter_server_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e865082f12b21b444c1a2699c083fc6ad0ea5c5
--- /dev/null
+++ b/tensorflow/python/distribute/parameter_server_strategy.py
@@ -0,0 +1,547 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Classes implementing a multi-worker ps DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import numpy_dataset
+from tensorflow.python.distribute import values
+from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
+from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
+from tensorflow.python.eager import context
+from tensorflow.python.framework import device as tf_device
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import device_setter
+from tensorflow.python.util import nest
+
+_LOCAL_CPU = "/device:CPU:0"
+_LOCAL_GPU_0 = "/device:GPU:0"
+
+
+# TODO(yuefengz): maybe cache variables on local CPU.
+class ParameterServerStrategy(distribute_lib.DistributionStrategy):
+  """A parameter server DistributionStrategy.
+
+  This strategy class works for both local training and between-graph replicated
+  training for multiple workers. It uses `TFConfigClusterResolver` to detect
+  configurations for multi-worker training. In multi-worker training mode, i.e.
+  `TFConfigClusterResolver` has detected 'TF_CONFIG' environment variable and
+  'TF_CONFIG' has a cluster spec, variables and updates to those variables are
+  assigned to parameter servers and other operations are assigned to workers.
+  In local training mode, variables are assigned to local CPU or the only GPU.
+  When each worker has more than one GPU, operations will be replicated on these
+  GPUs. In both cases, operations are replicated but variables are not and these
+  workers share a common view for which paramater server a variable is assigned
+  to.
+
+  This class assumes between-graph replication will be used and works on a graph
+  for a particular worker. Note that each graph and worker is independent.
+  This means that while each worker will synchronously compute a single gradient
+  update across all GPUs, updates between workers proceed asynchronously.
+  Operations that occur only on the first replica (such as incrementing the
+  global step), will occur on the first replica *of every worker*.
+
+  It is expected to call `call_for_each_replica(fn, ...)` for any
+  operations which potentially can be replicated across replicas (i.e. multiple
+  GPUs) even if there is only CPU or one GPU. When defining the `fn`, extra
+  caution needs to be taken:
+
+  1) It is generally not recommended to open a device scope under the strategy's
+  scope. A device scope (i.e. calling `tf.device`) will be merged with or
+  override the device for operations but will not change the device for
+  variables.
+
+  2) It is also not recommended to open a colocation scope (i.e. calling
+  `tf.colocate_with`) under the strategy's scope. For colocating variables,
+  use `distribution.colocate_vars_with` instead. Colocation of ops will possibly
+  create conflicts of device assignment.
+  """
+
+  def __init__(self):
+    """Initializes this strategy with default TFConfigClusterResolver."""
+    super(ParameterServerStrategy, self).__init__(
+        ParameterServerStrategyExtended(self))
+
+
+class ParameterServerStrategyExtended(
+    distribute_lib.DistributionStrategyExtended):
+  """Implementation of ParameterServerStrategy."""
+
+  def __init__(self,
+               container_strategy,
+               cluster_resolver=TFConfigClusterResolver()):
+    super(ParameterServerStrategyExtended, self).__init__(container_strategy)
+    self._initialize_strategy(cluster_resolver)
+
+    # We typically don't need to do all-reduce in this strategy.
+    self._cross_device_ops = (
+        cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
+            reduce_to_device=_LOCAL_CPU))
+
+  def _initialize_strategy(self, cluster_resolver):
+    if cluster_resolver.cluster_spec().as_dict():
+      self._initialize_multi_worker(cluster_resolver)
+    else:
+      self._initialize_local(cluster_resolver)
+    # Save the num_gpus_per_worker for configure method.
+    self._num_gpus_per_worker = cluster_resolver.num_accelerators()
+
+  def _initialize_multi_worker(self, cluster_resolver):
+    """Initialize devices for multiple workers.
+
+    It creates variable devices and compute devices. Variables and operations
+    will be assigned to them respectively. We have one compute device per
+    replica. The variable device is a device function or device string. The
+    default variable device assigns variables to parameter servers in a
+    round-robin fashion.
+
+    Args:
+      cluster_resolver: a descendant of `ClusterResolver` object.
+
+    Raises:
+      ValueError: if the cluster doesn't have ps jobs.
+    """
+    num_gpus = cluster_resolver.num_accelerators()
+    cluster_spec = cluster_resolver.cluster_spec()
+    task_type = cluster_resolver.task_type
+    task_id = cluster_resolver.task_index
+    if not task_type or task_id is None:
+      raise ValueError("When `cluster_spec` is given, you must also specify "
+                       "`task_type` and `task_id`")
+    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
+    assert cluster_spec.as_dict()
+
+    worker_device = "/job:%s/task:%d" % (task_type, task_id)
+    self._input_host_device = numpy_dataset.SingleDevice(worker_device)
+
+    # Define compute devices which is a list of device strings and one for each
+    # replica. When there are GPUs, replicate operations on these GPUs.
+    # Otherwise, place operations on CPU.
+    if num_gpus > 0:
+      compute_devices = tuple(
+          "%s/device:GPU:%d" % (worker_device, i) for i in range(num_gpus))
+    else:
+      compute_devices = (worker_device,)
+
+    self._device_map = values.ReplicaDeviceMap(compute_devices)
+    self._input_workers = input_lib.InputWorkers(
+        self._device_map, [(worker_device, compute_devices)])
+
+    # In distributed mode, place variables on ps jobs in a round-robin fashion.
+    # Note that devices returned from `replica_device_setter` are not
+    # canonical and therefore we don't canonicalize all variable devices to
+    # make them consistent.
+    # TODO(yuefengz): support passing a strategy object to control variable
+    # assignment.
+    # TODO(yuefengz): merge the logic of replica_device_setter into this
+    # class.
+    num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
+    if num_ps_replicas == 0:
+      raise ValueError("The cluster spec needs to have `ps` jobs.")
+    self._variable_device = device_setter.replica_device_setter(
+        ps_tasks=num_ps_replicas,
+        worker_device=worker_device,
+        merge_devices=True,
+        cluster=cluster_spec)
+
+    # The `_parameter_devices` is needed for the `parameter_devices` property
+    # and is a list of all variable devices. Here parameter devices are all
+    # tasks of the "ps" job.
+    self._parameter_devices = tuple(map("/job:ps/task:{}".format,
+                                        range(num_ps_replicas)))
+
+    # Add a default device so that ops without specified devices will not end up
+    # on other workers.
+    self._default_device = worker_device
+
+    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
+                                                task_id)
+    self._cluster_spec = cluster_spec
+    self._task_type = task_type
+    self._task_id = task_id
+
+    logging.info(
+        "Multi-worker ParameterServerStrategy with "
+        "cluster_spec = %r, task_type = %r, task_id = %r, "
+        "num_ps_replicas = %r, is_chief = %r, device_map = %r, "
+        "variable_device = %r", cluster_spec.as_dict(), task_type, task_id,
+        num_ps_replicas, self._is_chief, self._device_map,
+        self._variable_device)
+
+  def _initialize_local(self, cluster_resolver):
+    """Initialize internal devices for local training."""
+    worker_device = device_util.canonicalize("/device:CPU:0")
+    self._input_host_device = numpy_dataset.SingleDevice(worker_device)
+    num_gpus = cluster_resolver.num_accelerators()
+    # Define compute devices which is a list of device strings and one for each
+    # replica. When there are GPUs, replicate operations on these GPUs.
+    # Otherwise, place operations on CPU.
+    if num_gpus > 0:
+      compute_devices = tuple(map("/device:GPU:{}".format, range(num_gpus)))
+    else:
+      compute_devices = (_LOCAL_CPU,)
+
+    self._device_map = values.ReplicaDeviceMap(compute_devices)
+    self._input_workers = input_lib.InputWorkers(
+        self._device_map, [(worker_device, compute_devices)])
+
+    # If there is only one GPU, put everything on that GPU. Otherwise, place
+    # variables on CPU.
+    if num_gpus == 1:
+      assert len(compute_devices) == 1
+      self._variable_device = _LOCAL_GPU_0
+      self._parameter_devices = (_LOCAL_GPU_0,)
+    else:
+      self._variable_device = _LOCAL_CPU
+      self._parameter_devices = (_LOCAL_CPU,)
+
+    self._is_chief = True
+    self._cluster_spec = None
+    self._task_type = None
+    self._task_id = None
+
+    logging.info(
+        "ParameterServerStrategy with compute_devices = %r, "
+        "variable_device = %r", compute_devices, self._variable_device)
+
+  def _validate_colocate_with_variable(self, colocate_with_variable):
+    values.validate_colocate(colocate_with_variable, self)
+
+  def _distribute_dataset(self, dataset_fn):
+    """Distributes the dataset to each local GPU."""
+    return input_lib.PerReplicaDataset(
+        self._call_dataset_fn(dataset_fn),
+        self._input_workers,
+        0,
+        prefetch_on_device=True)
+
+  def _make_dataset_iterator(self, dataset):
+    return input_lib.DatasetIterator(dataset, self._input_workers,
+                                     self._num_replicas_in_sync)
+
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    """Distributes the dataset to each local GPU."""
+    if self._cluster_spec:
+      input_pipeline_id = multi_worker_util.id_in_cluster(
+          self._cluster_spec, self._task_type, self._task_id)
+      num_input_pipelines = multi_worker_util.worker_count(
+          self._cluster_spec, self._task_type)
+    else:
+      input_pipeline_id = 0
+      num_input_pipelines = 1
+    input_context = distribute_lib.InputContext(
+        num_input_pipelines=num_input_pipelines,
+        input_pipeline_id=input_pipeline_id,
+        num_replicas_in_sync=self._num_replicas_in_sync)
+    return input_lib.InputFunctionIterator(input_fn, self._input_workers,
+                                           [input_context])
+
+  def _experimental_make_numpy_dataset(self, numpy_input, session):
+    return numpy_dataset.one_host_numpy_dataset(
+        numpy_input, self._input_host_device, session)
+
+  def _broadcast_to(self, tensor, destinations):
+    # This is both a fast path for Python constants, and a way to delay
+    # converting Python values to a tensor until we know what type it
+    # should be converted to. Otherwise we have trouble with:
+    #   global_step.assign_add(1)
+    # since the `1` gets broadcast as an int32 but global_step is int64.
+    if isinstance(tensor, (float, int)):
+      return tensor
+    if not cross_device_ops_lib.check_destinations(destinations):
+      # TODO(josh11b): Use current logical device instead of 0 here.
+      destinations = values.LogicalDeviceSpec(
+          device_map=self._device_map, logical_device=0)
+    return self._cross_device_ops.broadcast(tensor, destinations)
+
+  def _allow_variable_partition(self):
+    return not context.executing_eagerly()
+
+  # TODO(yuefengz): not all ops in device_setter.STANDARD_PS_OPS will go through
+  # this creator, such as "MutableHashTable".
+  def _create_variable(self, next_creator, *args, **kwargs):
+    if self._num_replicas_in_sync > 1:
+      aggregation = kwargs.pop("aggregation", vs.VariableAggregation.NONE)
+      if aggregation not in (
+          vs.VariableAggregation.NONE,
+          vs.VariableAggregation.SUM,
+          vs.VariableAggregation.MEAN,
+          vs.VariableAggregation.ONLY_FIRST_REPLICA
+      ):
+        raise ValueError("Invalid variable aggregation mode: " + aggregation +
+                         " for variable: " + kwargs["name"])
+
+      def var_creator(*args, **kwargs):
+        """Create an AggregatingVariable and fix up collections."""
+        # Record what collections this variable should be added to.
+        collections = kwargs.pop("collections", None)
+        if collections is None:
+          collections = [ops.GraphKeys.GLOBAL_VARIABLES]
+        kwargs["collections"] = []
+
+        # Create and wrap the variable.
+        v = next_creator(*args, **kwargs)
+        wrapped = values.AggregatingVariable(
+            self._container_strategy(), v, aggregation)
+
+        # Add the wrapped variable to the requested collections.
+        # The handling of eager mode and the global step matches
+        # ResourceVariable._init_from_args().
+        if not context.executing_eagerly():
+          g = ops.get_default_graph()
+          # If "trainable" is True, next_creator() will add the contained
+          # variable to the TRAINABLE_VARIABLES collection, so we manually
+          # remove it and replace with the wrapper. We can't set "trainable"
+          # to False for next_creator() since that causes functions like
+          # implicit_gradients to skip those variables.
+          if kwargs.get("trainable", True):
+            collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
+            l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
+            l.remove(v)
+          g.add_to_collections(collections, wrapped)
+        elif ops.GraphKeys.GLOBAL_STEP in collections:
+          ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, wrapped)
+
+        return wrapped
+    else:
+      var_creator = next_creator
+
+    if "colocate_with" in kwargs:
+      colocate_with = kwargs["colocate_with"]
+      if isinstance(colocate_with, numpy_dataset.SingleDevice):
+        with ops.device(colocate_with.device):
+          return var_creator(*args, **kwargs)
+      with ops.device(None):
+        with ops.colocate_with(colocate_with):
+          return var_creator(*args, **kwargs)
+
+    with ops.colocate_with(None, ignore_existing=True):
+      with ops.device(self._variable_device):
+        return var_creator(*args, **kwargs)
+
+  def _call_for_each_replica(self, fn, args, kwargs):
+    # pylint: disable=protected-access
+    return mirrored_strategy._call_for_each_replica(
+        self._container_strategy(), self._device_map, fn, args, kwargs)
+
+  def _verify_destinations_not_different_worker(self, destinations):
+    if not self._cluster_spec:
+      return
+    if destinations is None:
+      return
+    for d in cross_device_ops_lib.get_devices_from(destinations):
+      d_spec = tf_device.DeviceSpec.from_string(d)
+      if d_spec.job == self._task_type and d_spec.task != self._task_id:
+        raise ValueError(
+            "Cannot reduce to another worker: %r, current worker is %r" %
+            (d, self._input_workers.worker_devices[0]))
+
+  def _reduce_to(self, reduce_op, value, destinations):
+    self._verify_destinations_not_different_worker(destinations)
+    if not isinstance(value, values.DistributedValues):
+      # pylint: disable=protected-access
+      return cross_device_ops_lib.reduce_non_distributed_value(
+          reduce_op, self._device_map, value, destinations)
+    return self._cross_device_ops.reduce(
+        reduce_op, value, destinations=destinations)
+
+  def _batch_reduce_to(self, reduce_op, value_destination_pairs):
+    for _, destinations in value_destination_pairs:
+      self._verify_destinations_not_different_worker(destinations)
+    return self._cross_device_ops.batch_reduce(reduce_op,
+                                               value_destination_pairs)
+
+  def _select_single_value(self, structured):
+    """Select any single values in `structured`."""
+
+    def _select_fn(x):  # pylint: disable=g-missing-docstring
+      if isinstance(x, values.Mirrored):
+        if len(x.devices) == 1:
+          return x.primary
+        else:
+          raise ValueError(
+              "You cannot update variable with a Mirrored object with multiple "
+              "components %r when using ParameterServerStrategy. You must "
+              "specify a single value or a Mirrored with a single value." % x)
+      elif isinstance(x, values.PerReplica):
+        raise ValueError(
+            "You cannot update variable with a PerReplica object %r when using "
+            "ParameterServerStrategy. You must specify a single value or a "
+            "Mirrored with a single value" % x)
+      else:
+        return x
+
+    return nest.map_structure(_select_fn, structured)
+
+  def _update(self, var, fn, args, kwargs, group):
+    if isinstance(var, values.AggregatingVariable):
+      var = var.get()
+    if not isinstance(var, resource_variable_ops.ResourceVariable):
+      raise ValueError(
+          "You can not update `var` %r. It must be a Variable." % var)
+    with ops.colocate_with(var), distribute_lib.UpdateContext(var.device):
+      result = fn(var, *self._select_single_value(args),
+                  **self._select_single_value(kwargs))
+      if group:
+        return result
+      else:
+        return nest.map_structure(self._unwrap, result)
+
+  # TODO(yuefengz): does it need to call _select_single_value?
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
+    with ops.device(
+        colocate_with.device), distribute_lib.UpdateContext(colocate_with):
+      result = fn(*args, **kwargs)
+      if group:
+        return result
+      else:
+        return nest.map_structure(self._unwrap, result)
+
+  def _unwrap(self, val):
+    if isinstance(val, values.DistributedValues):
+      return val.values
+    return (val,)
+
+  def value_container(self, val):
+    if (hasattr(val, "_aggregating_container") and
+        not isinstance(val, values.AggregatingVariable)):
+      wrapper = val._aggregating_container()  # pylint: disable=protected-access
+      if wrapper is not None:
+        return wrapper
+    return val
+
+  def read_var(self, var):
+    # No need to distinguish between normal variables and replica-local
+    # variables.
+    return array_ops.identity(var)
+
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
+    """Configures the strategy class.
+
+    The strategy object will be re-initialized if `cluster_spec` is given but
+    was not passed in the constructor.
+
+    Args:
+      session_config: not used currently.
+      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
+        cluster configurations.
+      task_type: the current task type.
+      task_id: the current task id.
+
+    Raises:
+      ValueError: if `cluster_spec` is given but `task_type` or `task_id` is
+        not.
+    """
+    if cluster_spec:
+      # Use the num_gpus_per_worker recorded in constructor since _configure
+      # doesn't take num_gpus.
+      cluster_resolver = SimpleClusterResolver(
+          cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
+          task_type=task_type,
+          task_index=task_id,
+          num_accelerators=self._num_gpus_per_worker)
+      self._initialize_multi_worker(cluster_resolver)
+
+    if session_config:
+      session_config.CopyFrom(self._update_config_proto(session_config))
+
+  def _update_config_proto(self, config_proto):
+    updated_config = copy.deepcopy(config_proto)
+    if not self._cluster_spec:
+      updated_config.isolate_session_state = True
+      return updated_config
+
+    updated_config.isolate_session_state = False
+
+    assert self._task_type
+    assert self._task_id is not None
+
+    # The device filters prevent communication between workers.
+    if self._task_type not in ["chief", "worker"]:
+      return updated_config
+    del updated_config.device_filters[:]
+    updated_config.device_filters.extend(
+        ["/job:%s/task:%d" % (self._task_type, self._task_id), "/job:ps"])
+    return updated_config
+
+  @property
+  def _num_replicas_in_sync(self):
+    return self._device_map.num_replicas_in_graph
+
+  @property
+  def worker_devices(self):
+    return self._device_map.all_devices
+
+  @property
+  def worker_devices_by_replica(self):
+    return self._device_map.devices_by_replica
+
+  @property
+  def parameter_devices(self):
+    return self._parameter_devices
+
+  def non_slot_devices(self, var_list):
+    return min(var_list, key=lambda x: x.name)
+
+  @property
+  def experimental_between_graph(self):
+    # TODO(yuefengz): Should this return False in the local case?
+    return True
+
+  @property
+  def experimental_should_init(self):
+    return self._is_chief
+
+  @property
+  def should_checkpoint(self):
+    return self._is_chief
+
+  @property
+  def should_save_summary(self):
+    return self._is_chief
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    """`make_dataset_iterator` and `make_numpy_iterator` use global batch size.
+
+    `distribute_dataset` and `make_input_fn_iterator` assume per-replica
+    batching.
+
+    Returns:
+      Boolean.
+    """
+    return True
diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py
index 1f5077a75ae1e460bca24172b4daa01cb74c7648..2a57d5cfc77176ec13040d56eb0f33bfd0c4692b 100644
--- a/tensorflow/python/distribute/values.py
+++ b/tensorflow/python/distribute/values.py
@@ -23,17 +23,12 @@ import contextlib
 import weakref
 import six
 
-from tensorflow.python.data.experimental.ops import batching
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import multi_device_iterator_ops
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import distribution_strategy_context
-from tensorflow.python.distribute import input_ops
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
-from tensorflow.python.framework import device as tf_device
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -416,11 +411,11 @@ def _assign_on_device(device, variable, tensor):
 
 
 def _assert_strategy(strategy):
-  if not distribution_strategy_context.has_distribution_strategy():
+  if not distribution_strategy_context.has_strategy():
     raise RuntimeError(
         'Need to be inside "with strategy.scope()" for %s' %
         (strategy,))
-  current_strategy = distribution_strategy_context.get_distribution_strategy()
+  current_strategy = distribution_strategy_context.get_strategy()
   if current_strategy is not strategy:
     raise RuntimeError(
         "Mixing different tf.distribute.Strategy objects: %s is not %s" %
@@ -1409,679 +1404,6 @@ def update_regroup(extended, device_map, updates, group):
   return nest.pack_sequence_as(regrouped, grouped_flat)
 
 
-class InputWorkers(object):
-  """A 1-to-many mapping from input worker devices to compute devices."""
-
-  def __init__(self, device_map, worker_device_pairs=None, logical_device=0):
-    """Initialize an `InputWorkers` object.
-
-    Args:
-      device_map: A `DeviceMap` with the computation devices fed by the
-        input workers.
-      worker_device_pairs: A sequence of pairs:
-        `(input device, a tuple of compute devices fed by that input device)`.
-      logical_device: The logical device of `device_map` to feed.
-    """
-    self._device_map = device_map
-    self._logical_device = logical_device
-    if worker_device_pairs is None:
-      worker_device_pairs = ((
-          device_util.canonicalize("/device:CPU:0"),
-          device_map.logical_to_actual_devices(logical_device)),)
-    self._input_worker_devices = tuple(d for d, _ in worker_device_pairs)
-    self._fed_devices = tuple(tuple(device_util.canonicalize(d) for d in f)
-                              for _, f in worker_device_pairs)
-    flattened = tuple(d for l in self._fed_devices for d in l)
-    assert (flattened ==
-            device_map.logical_to_actual_devices(logical_device)), (
-                "flattened: %s logical device %d: %s" %
-                (flattened, logical_device,
-                 device_map.logical_to_actual_devices(logical_device)))
-
-  @property
-  def device_map(self):
-    return self._device_map
-
-  @property
-  def logical_device(self):
-    return self._logical_device
-
-  @property
-  def num_workers(self):
-    return len(self._input_worker_devices)
-
-  @property
-  def worker_devices(self):
-    return self._input_worker_devices
-
-  def compute_devices_for_worker(self, worker_index):
-    return self._fed_devices[worker_index]
-
-  def __repr__(self):
-    devices = self.worker_devices
-    debug_repr = ",\n".join("  %d %s: %s" %
-                            (i, devices[i], self._fed_devices[i])
-                            for i in range(len(devices)))
-    return "%s:{\n%s\n  device_map: %s}" % (
-        self.__class__.__name__, debug_repr, self._device_map)
-
-
-class PerReplicaDataIterator(object):
-  """An iterator (like `tf.data.Iterator`) into a `PerReplicaDataset`."""
-
-  def __init__(self, iterator, input_workers, worker_index, prefetch_on_device):
-    assert isinstance(input_workers, InputWorkers)
-    self._iterator = iterator
-    self._input_workers = input_workers
-    self._worker_index = worker_index
-    self._prefetch_on_device = prefetch_on_device
-
-  @property
-  def initializer(self):
-    return self._iterator.initializer
-
-  def get_next_as_list(self, name=None):
-    """Scatter the input across devices."""
-    if self._prefetch_on_device:
-      data_list = self._iterator.get_next()
-    else:
-      batch = self._iterator.get_next(name=name)
-      data_list = []
-      def get_ith(i):
-        return lambda x: x[i]
-
-      devices = self._input_workers.compute_devices_for_worker(
-          self._worker_index)
-      for i, d in enumerate(devices):
-        v = nest.map_structure(get_ith(i), batch)
-        if context.executing_eagerly():
-          with ops.device(d):
-            v = nest.map_structure(array_ops.identity, v)
-        data_list.append(v)
-
-    return data_list
-
-  def get_next(self, name=None):
-    assert self._input_workers.num_workers == 1
-    data_list = self.get_next_as_list(name)
-    return regroup(self._input_workers.device_map, data_list)
-
-  @property
-  def output_classes(self):
-    return self._iterator.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._iterator.output_shapes
-
-  @property
-  def output_types(self):
-    return self._iterator.output_types
-
-
-class PerReplicaDataset(object):
-  """Like `tf.data.Dataset` split devices, producing `PerReplica` data."""
-
-  def __init__(self, dataset, input_workers, worker_index,
-               prefetch_on_device=None):
-    assert isinstance(input_workers, InputWorkers)
-    assert worker_index is not None
-    assert worker_index is not True
-    assert worker_index is not False
-    self._input_workers = input_workers
-    self._worker_index = worker_index
-
-    # Default to using prefetching, unless specified.
-    self._prefetch_on_device = prefetch_on_device
-    if self._prefetch_on_device is None:
-      self._prefetch_on_device = True
-
-    self._dataset = dataset
-    if not self._prefetch_on_device:
-      # TODO(priyag): If dropping remainder is not appropriate, find another
-      # approach to distributing the dataset when not possible to divide evenly.
-      # Possibly not an issue when we start using PartitionedDataset.
-      num_replicas = len(
-          self._input_workers.compute_devices_for_worker(self._worker_index))
-      self._dataset = self._dataset.batch(num_replicas, drop_remainder=True)
-    else:
-      self._replica_devices = self._input_workers.compute_devices_for_worker(
-          self._worker_index)
-
-  def make_one_shot_iterator(self):
-    """Get a one time use iterator for the distributed PerReplicaDataset."""
-    # Graph mode with one shot iterator is disabled.
-    if not context.executing_eagerly():
-      raise ValueError("Cannot create a one shot iterator. Please use "
-                       "`make_initializable_iterator()` instead.")
-    if self._prefetch_on_device:
-      dataset_iterator = multi_device_iterator_ops.MultiDeviceIterator(
-          self._dataset, self._replica_devices)
-    else:
-      dataset_iterator = dataset_ops.make_one_shot_iterator(self._dataset)
-    return PerReplicaDataIterator(
-        dataset_iterator,
-        self._input_workers,
-        self._worker_index,
-        prefetch_on_device=self._prefetch_on_device)
-
-  def make_initializable_iterator(self):
-    """Get an initializable iterator for the distributed PerReplicaDataset."""
-    # Eager mode generates already initialized iterators. Hence we cannot create
-    # an initializable iterator.
-    if context.executing_eagerly():
-      raise ValueError("Cannot create initializable iterator in Eager mode. "
-                       "Please use `make_one_shot_iterator` instead.")
-    if self._prefetch_on_device:
-      dataset_iterator = multi_device_iterator_ops.MultiDeviceIterator(
-          self._dataset, self._replica_devices)
-    else:
-      dataset_iterator = dataset_ops.make_initializable_iterator(self._dataset)
-    return PerReplicaDataIterator(
-        dataset_iterator, self._input_workers, self._worker_index,
-        prefetch_on_device=self._prefetch_on_device)
-
-
-class MultiWorkerDataIterator(object):
-  """An iterator (like `tf.data.Iterator`) into a `MultiWorkerDataset`."""
-
-  def __init__(self, iterators, input_workers):
-    """Initialize the `MultiWorkerDataIterator` object.
-
-    Args:
-      iterators: a list of worker, iterator pairs.
-      input_workers: an `InputWorkers` object.
-
-    Raises:
-      ValueError: if iterators and input_workers are not compatible.
-    """
-    assert isinstance(input_workers, InputWorkers)
-    workers = tuple(d for d, _ in iterators)
-    if workers != input_workers.worker_devices:
-      raise ValueError("iterators and input_workers are not compatible. "
-                       "iterator workers: %r input_workers devices: %r" %
-                       (workers, input_workers.worker_devices))
-    self._iterators = tuple(i for _, i in iterators)
-    self._input_workers = input_workers
-
-  @property
-  def initializer(self):
-    return control_flow_ops.group(
-        tuple(iterator.initializer for iterator in self._iterators))
-
-  def get_iterator(self, worker):
-    for i, w in enumerate(self._input_workers.worker_devices):
-      if worker == w:
-        return self._iterators[i]
-    return None
-
-  @property
-  def output_shapes(self):
-    return self._iterators[0].output_shapes
-
-  @property
-  def output_types(self):
-    return self._iterators[0].output_types
-
-  def get_next(self, name=None):
-    """Scatter the input across hosts and devices."""
-    replicas = []
-    for worker, iterator in zip(self._input_workers.worker_devices,
-                                self._iterators):
-      if name is not None:
-        d = tf_device.DeviceSpec.from_string(worker)
-        new_name = "%s_%s_%d" % (name, d.job, d.task)
-      else:
-        new_name = None
-      with ops.device(worker):
-        data_per_worker = iterator.get_next_as_list(name=new_name)
-        # Append to replicas to get a flat list of values indexed by replica.
-        replicas.extend(data_per_worker)
-
-    return regroup(self._input_workers.device_map, replicas)
-
-
-class MultiWorkerDataset(object):
-  """Like a `tf.data.Dataset` that distributes data to different workers.
-
-  Each worker gets one shard of the input dataset. This currently does not work
-  in eager mode.
-  """
-
-  def __init__(self, dataset_fn, input_workers, prefetch_on_device=None,
-               auto_shard=False):
-    """Initialize the MultiWorkerDataset object.
-
-    Args:
-      dataset_fn: a function or a list of functions that returns a
-        `tf.data.Dataset`.
-      input_workers: an `InputWorkers` object.
-      prefetch_on_device: whether to prefetch to devices.
-      auto_shard: whether to auto-shard the dataset.
-    """
-    assert isinstance(input_workers, InputWorkers)
-    if isinstance(dataset_fn, (list, tuple)):
-      if len(dataset_fn) != input_workers.num_workers:
-        raise ValueError("If `dataset_fn` is a list, it must have one entry "
-                         "per worker")
-    # TODO(rohanj): b/120673685 to track re-enabling auto sharding.
-    if auto_shard:
-      raise ValueError("Currently autosharding is not supported.")
-    self._input_workers = input_workers
-    self._datasets = []
-    # TODO(yuefengz, priyag): support different set of jobs for input
-    # processing.
-    for i, worker in enumerate(input_workers.worker_devices):
-      with ops.device(worker):
-        if isinstance(dataset_fn, (list, tuple)):
-          worker_input = dataset_fn[i]()
-        else:
-          worker_input = dataset_fn()
-        dataset = PerReplicaDataset(worker_input, input_workers, i,
-                                    prefetch_on_device=prefetch_on_device)
-        self._datasets.append((worker, dataset))
-
-  def make_one_shot_iterator(self):
-    iterators = []
-    for worker, dataset in self._datasets:
-      with ops.device(worker):
-        iterators.append((worker, dataset_ops.make_one_shot_iterator(dataset)))
-    return MultiWorkerDataIterator(iterators, self._input_workers)
-
-  def make_initializable_iterator(self):
-    iterators = []
-    for worker, dataset in self._datasets:
-      with ops.device(worker):
-        iterators.append(
-            (worker, dataset_ops.make_initializable_iterator(dataset)))
-    return MultiWorkerDataIterator(iterators, self._input_workers)
-
-
-class InputIterator(object):
-  """An input iterator, intended to be passed to `DistributionStrategy.run`."""
-
-  def get_next(self):
-    """Returns the next inputs for all replicas."""
-    raise NotImplementedError("must be implemented in descendants")
-
-  def initialize(self):
-    """Initialize the underlying input dataset, when applicable.
-
-    In eager mode, this will create a new iterator and return it.
-    In graph mode, this will initialize the same underlying iterator(s).
-
-    Users are required to call this if
-    - This iterator was returned from a call to `make_input_fn_iterator` with an
-      input function that returns a dataset.
-    - Or this iterator was returned from a call to `make_dataset_iterator`.
-
-    Returns:
-      A list of initialization ops to be executed.
-    """
-    raise NotImplementedError("must be implemented in descendants")
-
-
-class InputIteratorImpl(InputIterator):
-  """Common implementation for all input iterators."""
-
-  def __init__(self, input_workers, iterators):
-    assert isinstance(input_workers, InputWorkers)
-    if not input_workers.worker_devices:
-      raise ValueError("Should have at least one worker for input iterator.")
-
-    self._iterators = iterators
-    self._input_workers = input_workers
-
-  def get_next(self, name=None):
-    """Returns the next input from the iterator for all replicas."""
-    replicas = []
-    for i, worker in enumerate(self._input_workers.worker_devices):
-      if name is not None:
-        d = tf_device.DeviceSpec.from_string(worker)
-        new_name = "%s_%s_%d" % (name, d.job, d.task)
-      else:
-        new_name = None
-      with ops.device(worker):
-        # Make `replicas` a flat list of values across all replicas.
-        replicas.extend(self._iterators[i].get_next_as_list(new_name))
-
-    return regroup(self._input_workers.device_map, replicas)
-
-  def initialize(self):
-    """Initialze underlying iterators.
-
-    Returns:
-      A list of any initializer ops that should be run.
-    """
-    init_ops = []
-    for it in self._iterators:
-      init_ops.extend(it.initialize())
-    return init_ops
-
-  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
-  @property
-  def output_classes(self):
-    return self._iterators[0].output_classes
-
-  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
-  @property
-  def output_shapes(self):
-    return self._iterators[0].output_shapes
-
-  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
-  @property
-  def output_types(self):
-    return self._iterators[0].output_types
-
-  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
-  def get_iterator(self, worker):
-    for i, w in enumerate(self._input_workers.worker_devices):
-      if worker == w:
-        return self._iterators[i]
-    return None
-
-
-class InputFunctionIterator(InputIteratorImpl):
-  """Iterator created from input function."""
-
-  def __init__(self, input_fn, input_workers, input_contexts):
-    """Make an iterator for input provided via an input function.
-
-    Currently implements PER_WORKER mode, in which the `input_fn` is called
-    once on each worker.
-
-    TODO(priyag): Add other replication modes.
-    TODO(priyag): Allow taking input function that returns a callable that
-    returns nest of tensors.
-
-    Args:
-      input_fn: Input function that returns a `tf.data.Dataset` object.
-      input_workers: an `InputWorkers` object.
-      input_contexts: A list of `InputContext` instances to be passed to call(s)
-        to `input_fn`. Length and order should match worker order in
-        `worker_device_pairs`.
-    """
-    assert isinstance(input_workers, InputWorkers)
-    if input_workers.num_workers != len(input_contexts):
-      raise ValueError(
-          "Number of input workers (%d) is not same as number of "
-          "input_contexts (%d)" %
-          (input_workers.num_workers, len(input_contexts)))
-
-    iterators = []
-    for i, ctx in enumerate(input_contexts):
-      worker = input_workers.worker_devices[i]
-      with ops.device(worker):
-        result = input_fn(ctx)
-        if not isinstance(result, dataset_ops.DatasetV2):
-          raise ValueError("input_fn must return a tf.data.Dataset.")
-        devices = input_workers.compute_devices_for_worker(i)
-        iterator = _SingleWorkerDatasetIterator(result, worker, devices)
-        iterators.append(iterator)
-
-    super(InputFunctionIterator, self).__init__(input_workers, iterators)
-
-
-class DatasetIterator(InputIteratorImpl):
-  """Iterator created from input dataset."""
-
-  def __init__(self, dataset, input_workers, split_batch_by=None):
-    """Make an iterator for the dataset on given devices.
-
-    If `split_batch_by` is not None, we "split" each batch of the
-    dataset by `split_batch_by` value. To achieve this, we first unbatch the
-    input dataset and then rebatch it with the per replica batch size that is
-    calculated using `global_batch_size // split_batch_by`.
-    The currently supported datasets are as follows:
-    `dataset.batch()` is the last operation on the dataset OR
-    `dataset.apply(map_and_batch)` is the last operation on the dataset OR
-    `dataset.batch().prefetch()` are the last 2 operations on the dataset OR
-    `dataset.apply(map_and_batch).prefetch()` are the last 2 operations.
-
-    TODO(priyag): Support multi worker / host cases properly by cloning
-    and sharding the dataset on each worker. Current setup will only work in
-    some cases, such as in-graph multi worker GPU case. If the input pipeline
-    has random shuffling (with a different seed on each worker), each worker
-    will see random input from the same overall dataset in each step. Otherwise,
-    each worker will see the same input in each step.
-
-    Args:
-      dataset: `tf.data.Dataset` that will be used as the input source.
-      input_workers: an `InputWorkers` object.
-      split_batch_by: Optional integer. If present, we "split" each batch of the
-        dataset by `split_batch_by` value.
-    """
-    assert isinstance(input_workers, InputWorkers)
-    if split_batch_by:
-      dataset = _split_dataset_batch(dataset, split_batch_by)
-
-    iterators = []
-    for i, worker in enumerate(input_workers.worker_devices):
-      with ops.device(worker):
-        worker_devices = input_workers.compute_devices_for_worker(i)
-        cloned_dataset = dataset
-        if not context.executing_eagerly():
-          cloned_dataset = input_ops._clone_dataset(dataset)  # pylint: disable=protected-access
-        iterator = _SingleWorkerDatasetIterator(cloned_dataset, worker,
-                                                worker_devices)
-        iterators.append(iterator)
-
-    super(DatasetIterator, self).__init__(input_workers, iterators)
-
-
-class _SingleWorkerDatasetIterator(object):
-  """Iterator for a single `tf.data.Dataset`."""
-
-  def __init__(self, dataset, worker, devices):
-    """Create iterator for the `dataset` to fetch data to worker's `devices` .
-
-    `MultiDeviceIterator` is used to prefetch input to the devices on the
-    given worker.
-
-    Args:
-      dataset: A `tf.data.Dataset` instance.
-      worker: Worker on which ops should be created.
-      devices: Distribute data from `dataset` to these devices.
-    """
-    self._dataset = dataset
-    self._worker = worker
-    self._devices = devices
-    self._make_iterator()
-
-  def _make_iterator(self):
-    """Make appropriate iterator on the dataset."""
-    with ops.device(self._worker):
-      self._iterator = multi_device_iterator_ops.MultiDeviceIterator(
-          self._dataset, self._devices)
-
-  def get_next_as_list(self, name=None):
-    """Get next element from the underlying iterator."""
-    del name
-    with ops.device(self._worker):
-      data_list = self._iterator.get_next()
-      return data_list
-
-  def initialize(self):
-    """Initialze underlying iterator.
-
-    In eager execution, this simply recreates the underlying iterator.
-    In graph execution, it returns the initializer ops for the underlying
-    iterator.
-
-    Returns:
-      A list of any initializer ops that should be run.
-    """
-    if context.executing_eagerly():
-      self._make_iterator()
-      return []
-    else:
-      return [self._iterator.initializer]
-
-  @property
-  def output_classes(self):
-    return self._iterator.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._iterator.output_shapes
-
-  @property
-  def output_types(self):
-    return self._iterator.output_types
-
-
-def _split_dataset_batch(dataset, split_batch_by):
-  """Divide a batch-ed dataset's batches into smaller batches."""
-  # TODO(sourabhbajaj): Remove this in lieu of distributed datasets
-  # pylint: disable=protected-access
-  def _get_batch_dataset(d):
-    """Get the underlying batch dataset from the dataset object."""
-    if isinstance(d, dataset_ops.DatasetV1Adapter):
-      d = d._dataset
-
-    if isinstance(d, (dataset_ops.BatchDataset, batching._MapAndBatchDataset)):
-      return d
-    elif isinstance(d, dataset_ops.PrefetchDataset):
-      return _get_batch_dataset(d._input_dataset)
-    raise ValueError(
-        "Unable to get batched dataset from the input dataset. `batch` "
-        "`map_and_batch` need to be the last operations on the dataset. "
-        "The batch operations can be followed by a prefetch.")
-
-  batched_dataset = _get_batch_dataset(dataset)
-  if isinstance(batched_dataset, dataset_ops.BatchDataset):
-    batch_size = batched_dataset._batch_size
-    drop_remainder = batched_dataset._drop_remainder
-  elif isinstance(batched_dataset, batching._MapAndBatchDataset):
-    batch_size = batched_dataset._batch_size_t
-    drop_remainder = batched_dataset._drop_remainder_t
-
-  prefetch_buffer = None
-  if isinstance(dataset, dataset_ops.PrefetchDataset):
-    prefetch_buffer = dataset._buffer_size
-  elif (isinstance(dataset, dataset_ops.DatasetV1Adapter)
-        and isinstance(dataset._dataset, dataset_ops.PrefetchDataset)):
-    prefetch_buffer = dataset._dataset._buffer_size
-  # pylint: enable=protected-access
-
-  if tensor_util.is_tensor(batch_size):
-    batch_size = tensor_util.constant_value(batch_size)
-
-  if tensor_util.is_tensor(drop_remainder):
-    drop_remainder = tensor_util.constant_value(drop_remainder)
-
-  if batch_size % split_batch_by:
-    raise ValueError(
-        "Batch size %s cannot be sharded evenly across replicas %s" % (
-            batch_size, split_batch_by))
-  new_batch_size = batch_size // split_batch_by
-
-  dataset = dataset.apply(batching.unbatch())
-  dataset = dataset.batch(new_batch_size, drop_remainder=drop_remainder)
-  if prefetch_buffer is not None:
-    dataset = dataset.prefetch(prefetch_buffer)
-  return dataset
-
-
-class MultiStepContext(object):
-  """A context object that can be used to capture things when running steps.
-
-  This context object is useful when running multiple steps at a time using the
-  `experimental_run_steps_on_iterator` API. For e.g. it allows the user's step
-  function to specify which outputs to emit at what frequency. Currently it
-  supports capturing output from the last step, as well as capturing non tensor
-  outputs.  In the future it will be augmented to support other use cases such
-  as output each N steps.
-  """
-
-  def __init__(self):
-    """Initialize an output context.
-
-    Returns:
-      A context object.
-    """
-    self._last_step_outputs = {}
-    self._last_step_outputs_reduce_ops = {}
-    self._non_tensor_outputs = {}
-
-  @property
-  def last_step_outputs(self):
-    """A dictionary consisting of outputs to be captured on last step.
-
-    Keys in the dictionary are names of tensors to be captured, as specified
-    when `set_last_step_output` is called.
-    Values in the dictionary are the tensors themselves. If
-    `set_last_step_output` was called with a `reduce_op` for this output,
-    then the value is the reduced value.
-
-    Returns:
-      A dictionary with last step outputs.
-    """
-    return self._last_step_outputs
-
-  def _set_last_step_outputs(self, outputs):
-    """Replace the entire dictionary of last step outputs."""
-    if not isinstance(outputs, dict):
-      raise ValueError("Need a dictionary to set last_step_outputs.")
-    self._last_step_outputs = outputs
-
-  def set_last_step_output(self, name, output, reduce_op=None):
-    """Set `output` with `name` to be outputted from the last step.
-
-    Args:
-      name: String, name to identify the output. Doesn't need to match tensor
-        name.
-      output: The tensors that should be outputted with `name`. See below for
-        actual types supported.
-      reduce_op: Reduction method to use to reduce outputs from multiple
-        replicas. Required if `set_last_step_output` is called in a replica
-        context. Optional in cross_replica_context.
-        When present, the outputs from all the replicas are reduced using the
-        current distribution strategy's `reduce` method. Hence, the type of
-        `output` must be what's supported by the corresponding `reduce` method.
-        For e.g. if using MirroredStrategy and reduction is set, output
-        must be a `PerReplica` value.
-        The reduce method is also recorded in a dictionary
-        `_last_step_outputs_reduce_ops` for later interpreting of the
-        outputs as already reduced or not.
-    """
-    if distribution_strategy_context.in_cross_replica_context():
-      self._last_step_outputs_reduce_ops[name] = reduce_op
-      if reduce_op is None:
-        self._last_step_outputs[name] = output
-      else:
-        distribution = distribution_strategy_context.get_distribution_strategy()
-        self._last_step_outputs[name] = distribution.reduce(reduce_op, output)
-    else:
-      assert reduce_op is not None
-      def merge_fn(distribution, value):
-        self._last_step_outputs[name] = distribution.reduce(reduce_op, value)
-        # Setting this inside the `merge_fn` because all replicas share the same
-        # context object, so it's more robust to set it only once (even if all
-        # the replicas are trying to set the same value).
-        self._last_step_outputs_reduce_ops[name] = reduce_op
-
-      distribution_strategy_context.get_replica_context().merge_call(
-          merge_fn, args=(output,))
-
-  @property
-  def non_tensor_outputs(self):
-    """A dictionary consisting of any non tensor outputs to be captured."""
-    return self._non_tensor_outputs
-
-  def set_non_tensor_output(self, name, output):
-    """Set `output` with `name` to be captured as a non tensor output."""
-    if distribution_strategy_context.in_cross_replica_context():
-      self._non_tensor_outputs[name] = output
-    else:
-      def merge_fn(distribution, value):
-        # NOTE(priyag): For non tensor outputs, we simply return all the values
-        # in a list as reduction doesn't make sense on non tensors.
-        self._non_tensor_outputs[name] = distribution.unwrap(value)
-      distribution_strategy_context.get_replica_context().merge_call(
-          merge_fn, args=(output,))
-
-
 def value_container(val):
   """Returns the container that this per-replica `value` belongs to.
 
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index cd5c0be283eea729574614032817632de6b86fff..1f1cb22d58fab05c5c8c8478cdcf1f304521d878 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -1,6 +1,6 @@
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "py_test", "tf_cc_binary")
+load("//tensorflow:tensorflow.bzl", "tf_py_test", "tf_cc_binary")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load(
     "//tensorflow/tools/test:performance.bzl",
@@ -255,11 +255,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "execution_callbacks_test",
     srcs = ["execution_callbacks_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":execution_callbacks",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
@@ -373,11 +372,10 @@ tf_py_logged_benchmark(
     target = "//tensorflow/python/eager:benchmarks_test",
 )
 
-py_test(
+tf_py_test(
     name = "tape_test",
     srcs = ["tape_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":backprop",
         ":context",
         ":test",
@@ -414,20 +412,19 @@ cuda_py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "pywrap_tfe_test",
     srcs = ["pywrap_tfe_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":backprop",
         ":context",
         ":core",
         ":test",
+        "//third_party/py/numpy",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:random_ops",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -491,11 +488,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "def_function_test",
     srcs = ["def_function_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":def_function",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -519,11 +515,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "wrap_function_test",
     srcs = ["wrap_function_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":wrap_function",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 7d99f6e95a4d4a71e6f3032c27efcf5b8c889037..42db726a7bb576e7899845cdd94bb21af1eef67a 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -930,11 +930,12 @@ class GradientTape(object):
                             "gradient in order to compute higher order "
                             "derrivatives.", 1)
 
-    flat_targets = nest.flatten(target)
-    for t in flat_targets:
+    flat_targets = []
+    for t in nest.flatten(target):
       if resource_variable_ops.is_resource_variable(t):
-        raise ValueError("GradientTape.gradient is not supported for variable "
-                         "targets.")
+        with self:
+          t = ops.convert_to_tensor(t)
+      flat_targets.append(t)
 
     flat_sources = nest.flatten(sources)
     flat_sources = [_handle_or_self(x) for x in flat_sources]
diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py
index 22ae6f74cb6a5fa0a3a9ab16b516b8798291f4b8..5f4fda8897b3913ffeb165819a4b7859821ec3b8 100644
--- a/tensorflow/python/eager/backprop_test.py
+++ b/tensorflow/python/eager/backprop_test.py
@@ -354,6 +354,16 @@ class BackpropTest(test.TestCase):
       loss += v * v
     self.assertAllEqual(t.gradient(loss, v), 2.0)
 
+  def testPythonMax(self):
+    x = [resource_variable_ops.ResourceVariable(2.),
+         resource_variable_ops.ResourceVariable(3.),
+         resource_variable_ops.ResourceVariable(5.)]
+    with backprop.GradientTape() as t:
+      f = max(x)
+    grad = t.gradient(f, x)
+    self.assertAllEqual(self.evaluate(f), 5.)
+    self.assertAllEqual(self.evaluate(grad), [None, None, 1.0])
+
   def testAutomaticWatchedVariables(self):
     with backprop.GradientTape() as t:
       self.assertEqual(0, len(t.watched_variables()))
@@ -674,10 +684,8 @@ class BackpropTest(test.TestCase):
     with backprop.GradientTape() as g:
       x = variables.Variable([3.0])
       y = variables.Variable([2.0])
-    with self.assertRaisesRegexp(
-        ValueError,
-        'GradientTape.gradient is not supported for variable targets.'):
-      g.gradient(x, y)
+    grad = g.gradient(x, y)
+    self.assertAllEqual(grad, None)
 
   @test_util.run_in_graph_and_eager_modes
   @test_util.run_v1_only('b/120545219')
diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 31a7efca82b016bc193ab9985ea7603897edc7ac..62c4a12cbfade450cf7c2acff2ec4d14c30ab1aa 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -140,7 +140,7 @@ class MicroBenchmarks(test.Benchmark):
     self._m_2_by_2 = random_ops.random_uniform((2, 2))
     self._m_100_by_784 = random_ops.random_uniform((100, 784))
     self._num_iters_2_by_2 = 30000
-    self._num_iters_100_by_784 = 1000
+    self._num_iters_100_by_784 = 30000
 
   def _run(self, func, num_iters, execution_mode=None):
     # call func to maybe warm up the GPU
@@ -370,6 +370,19 @@ class MicroBenchmarks(test.Benchmark):
     func = lambda: f(m, m, transpose_b=transpose_b)
     self._run(func, num_iters, execution_mode=execution_mode)
 
+  def _benchmark_nested_defun_matmul(self, m, transpose_b, num_iters):
+    inner = function.defun(math_ops.matmul)
+
+    @function.defun
+    def outer(a, b, c, transpose_b):
+      return math_ops.matmul(inner(a, b, transpose_b=transpose_b), c)
+
+    func = lambda: outer(m, m, m, transpose_b=transpose_b)
+    # Warmup before benchmark
+    for _ in range(1000):
+      func()
+    self._run(func, num_iters)
+
   def _benchmark_defun_matmul_forward_backward(self,
                                                m,
                                                transpose_b,
@@ -525,6 +538,11 @@ class MicroBenchmarks(test.Benchmark):
           num_iters=self._num_iters_2_by_2,
           execution_mode=context.ASYNC)
 
+  def benchmark_nested_defun_matmul_2_by_2(self):
+    m = self._m_2_by_2.cpu()
+    self._benchmark_nested_defun_matmul(
+        m, transpose_b=False, num_iters=self._num_iters_2_by_2)
+
   # Benchmarks for AA.T, A of dimension 100 by 784.
   def benchmark_np_matmul_100_by_784(self):
     self._benchmark_np_matmul(
@@ -614,6 +632,11 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_defun_matmul(
           m, transpose_b=True, num_iters=self._num_iters_100_by_784)
 
+  def benchmark_nested_defun_matmul_100_by_784(self):
+    m = self._m_100_by_784.gpu()
+    self._benchmark_nested_defun_matmul(
+        m, transpose_b=True, num_iters=self._num_iters_100_by_784)
+
   def benchmark_defun_without_signature(self):
 
     def func(t1, t2, t3, t4, t5, t6, t7, t8):
@@ -867,6 +890,10 @@ class MicroBenchmarks(test.Benchmark):
     self._run(scan, 100)
 
   def benchmarkScanDefun(self):
+    if context.num_gpus():
+      # TODO(b/122081934): Re-enable this after figuring out why this became
+      # really slow with control flow V2
+      return
     elems = math_ops.range(1600)
 
     @function.defun
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index cd43dc7ab298bb3bed6128799bf22804f0cdc3d3..faaf742c26397afc6468d4e6f4627df5895a324e 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -788,6 +788,27 @@ def in_eager_mode():
   return executing_eagerly()
 
 
+def shared_name(name=None):
+  """Returns the anonymous shared name GUID if no shared name is specified.
+
+  In eager mode we need to use a unique shared name to avoid spurious sharing
+  issues. The runtime generates a unique name on our behalf when the reserved
+  GUID is used as a shared name.
+
+  Args:
+    name: Optional shared name
+
+  Returns:
+    Eager compatible shared name.
+  """
+  if name or not executing_eagerly():
+    return name
+
+  # Ensure a unique name when eager execution is enabled to avoid spurious
+  # sharing issues.
+  return "cd2c89b7-88b7-44c8-ad83-06c2a9158347"
+
+
 def graph_mode():
   """Context-manager to disable eager execution for the current thread."""
   return context()._mode(GRAPH_MODE)  # pylint: disable=protected-access
diff --git a/tensorflow/python/eager/def_function.py b/tensorflow/python/eager/def_function.py
index ebc47d156691abf6cb3d0894ca11647fb912cda7..4c22a12c1c078c9d9992d5d0c4a6c609eda48584 100644
--- a/tensorflow/python/eager/def_function.py
+++ b/tensorflow/python/eager/def_function.py
@@ -130,8 +130,9 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
                         if init_from_fn else [initial_value]) as name:
       # pylint: disable=protected-access
       with ops.init_scope():
-        shared_name = ops._name_from_scope_name(name)
-        shared_name = "%s_%d" % (shared_name, ops.uid())
+        handle_name = ops._name_from_scope_name(name)
+        unique_id = "%s_%d" % (handle_name, ops.uid())
+        shared_name = context.shared_name(unique_id)
       with ops.name_scope("Initializer"), ops.device(None):
         initial_value = ops.convert_to_tensor(
             initial_value() if init_from_fn else initial_value,
@@ -144,8 +145,8 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
             name=name,
             graph_mode=self._in_graph_mode)
       self._shape = initial_value.shape
-      self._unique_id = shared_name
-      self._handle_name = shared_name + ":0"
+      self._unique_id = unique_id
+      self._handle_name = handle_name + ":0"
       self._dtype = initial_value.dtype.base_dtype
       self._constraint = constraint
       assert initial_value is not None
@@ -445,10 +446,12 @@ class PolymorphicFunction(object):
   @property
   def _cached_input_signatures(self):
     """All input signatures used to call this PolymorphicFunction."""
-    seen = set()
-    # Preserves signature ordering rather than returning a set() so that we
-    # don't need to re-sort signatures later to work around Python 2's set
-    # nondeterminism.
+    seen = list()
+    # We are using a list so that:
+    #  - the returned collection is deterministic, and
+    #  - we can use a custom equality operator (is_same_structure).
+    # This is run only at serialization time on likely very small inputs so we
+    # are not concerned about O(n^2) runtime.
     # pylint: disable=protected-access
     concrete_functions = []
     if self._stateful_fn:
@@ -457,9 +460,11 @@ class PolymorphicFunction(object):
       concrete_functions.extend(self._stateless_fn._function_cache.values())
     for concrete_function in concrete_functions:
       signature = concrete_function._python_call_signature
-      if signature not in seen:
+      equal_to_signature = functools.partial(
+          function_lib.is_same_structure, signature, check_values=True)
+      if not any(equal_to_signature(s) for s in seen):
         yield signature
-        seen.add(signature)
+        seen.append(signature)
     # pylint: enable=protected-access
 
   def get_concrete_function(self, *args, **kwargs):
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 58d1f6b886c789d310286a95abc61e4eb6aebfc3..83cd140158941b58d09844d84b6389b1cac30432 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -41,6 +41,8 @@ from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes as dtypes_module
+from tensorflow.python.framework import error_interpolation
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import func_graph as func_graph_module
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
@@ -74,6 +76,24 @@ CacheKey = collections.namedtuple("CacheKey", [
 ])
 
 
+def is_same_structure(structure1,
+                      structure2,
+                      check_values=False):
+  """Check two structures for equality, optionally of types and of values."""
+  try:
+    nest.assert_same_structure(structure1, structure2)
+  except (ValueError, TypeError):
+    return False
+  if check_values:
+    flattened1 = nest.flatten(structure1)
+    flattened2 = nest.flatten(structure2)
+    # First check the types to avoid AttributeErrors.
+    if any(type(f1) != type(f2) for f1, f2 in zip(flattened1, flattened2)):
+      return False
+    return flattened1 == flattened2
+  return True
+
+
 def _parse_func_attrs(attributes):
   """Convert the keyword arguments into function_def attributes.
 
@@ -105,7 +125,7 @@ def _parse_func_attrs(attributes):
       attrs[key] = attr_value_pb2.AttrValue(i=value)
     elif isinstance(value, float):
       attrs[key] = attr_value_pb2.AttrValue(f=value)
-    elif isinstance(value, (str, bytes)):
+    elif isinstance(value, (str, bytes, six.text_type)):
       attrs[key] = attr_value_pb2.AttrValue(s=compat.as_bytes(value))
     else:
       raise ValueError("Unsupported attribute type for %s with type %s" %
@@ -113,6 +133,46 @@ def _parse_func_attrs(attributes):
   return attrs
 
 
+class _InterpolateFunctionError(object):
+  """Context Manager that interpolates the exception from 'top_level_func'."""
+
+  def __init__(self, top_level_func):
+    self._func = top_level_func
+
+  def __enter__(self):
+    pass
+
+  def __exit__(self, typ, exc, tb):
+    if not exc or not isinstance(exc, errors.OpError):
+      return False
+    message = compat.as_text(exc.message)
+    _, tags = error_interpolation.parse_message(message)
+    g = None
+    func_stack = []
+    # pylint: disable=protected-access
+    for t in tags:
+      if t.type == "function_node":
+        if t.name == compat.as_str(self._func.name):
+          g = self._func._graph
+        elif g:
+          next_func = g._get_function(t.name)
+          if next_func is not None and isinstance(next_func,
+                                                  _EagerDefinedFunction):
+            g = next_func._graph
+        if g:
+          func_stack.append(g.name)
+        else:
+          func_stack.append("<unknown>")
+    # pylint: enable=protected-access
+    if g:
+      message = error_interpolation.interpolate(message, g)
+      message += "\n\nFunction call stack:\n"
+      message += " -> ".join(func_stack)
+      message += "\n"
+      exc._message = message  # pylint: disable=protected-access
+    return False
+
+
 def _forward_name(n):
   """The name of a generated forward defun named n."""
   return "__forward_%s_%s" % (n, ops.uid())
@@ -261,13 +321,14 @@ class _EagerDefinedFunction(object):
             "Arguments and signature arguments do not match: %s %s " %
             (len(args), len(list(self.signature.input_arg))))
       function_call_options = ctx.get_function_call_options()
-      outputs = functional_ops.partitioned_call(
-          args=args,
-          f=self,
-          tout=self._output_types,
-          executing_eagerly=executing_eagerly,
-          config=function_call_options.config_proto_serialized,
-          executor_type=function_call_options.executor_type)
+      with _InterpolateFunctionError(self):
+        outputs = functional_ops.partitioned_call(
+            args=args,
+            f=self,
+            tout=self._output_types,
+            executing_eagerly=executing_eagerly,
+            config=function_call_options.config_proto_serialized,
+            executor_type=function_call_options.executor_type)
 
     if executing_eagerly:
       return outputs
@@ -383,8 +444,8 @@ class Function(object):
     """
     return self._call_flat(
         (t for t in nest.flatten((args, kwargs))
-         if isinstance(
-             t, (ops.Tensor, resource_variable_ops.ResourceVariable))))
+         if isinstance(t, (ops.Tensor,
+                           resource_variable_ops.ResourceVariable))))
 
   def _call_flat(self, args):
     """Executes the wrapped function.
@@ -919,10 +980,7 @@ class FunctionSpec(object):
     else:
       assert not kwargs
       signature_relevant_inputs = inputs[:len(self.input_signature)]
-      try:
-        nest.assert_same_structure(self.input_signature,
-                                   signature_relevant_inputs)
-      except (ValueError, TypeError):
+      if not is_same_structure(self.input_signature, signature_relevant_inputs):
         raise ValueError("Structure of Python function inputs does not match "
                          "input_signature.")
       signature_inputs_flat = nest.flatten(signature_relevant_inputs)
@@ -1049,9 +1107,7 @@ class PolymorphicFunction(object):
                          "input_signature is provided.")
       if args:
         # If args are provided, they must match the input signature.
-        try:
-          nest.assert_same_structure(self._input_signature, args)
-        except (ValueError, TypeError):
+        if not is_same_structure(self._input_signature, args):
           raise ValueError("Structure of Python function inputs does not match "
                            "input_signature.")
         flat_inputs = nest.flatten(args)
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 55a9cc4e92336452260d0de1991e68ee67dd22e2..1966b259bfd6ab779badf25b2d5549d53b948b5b 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -45,6 +45,7 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training as keras_training
 from tensorflow.python.layers import convolutional
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
@@ -2063,6 +2064,29 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     # function itself is not involved in a reference cycle.
     self.assertIs(None, weak_fn())
 
+  def testFunctionStackInErrorMessage(self):
+
+    @def_function.function()
+    def fn3(x):
+      return x + 2
+
+    @def_function.function()
+    def fn2(x):
+      check_ops.assert_equal(fn3(x), 3)
+      return 2
+
+    @def_function.function()
+    def fn(x):
+      return fn2(x)
+
+    try:
+      fn(2)
+      self.assertFail()
+    except errors.InvalidArgumentError as e:
+      self.assertIn('fn -> fn2', e.message)
+      self.assertIn('node assert_equal/Assert/Assert (defined at', e.message)
+      self.assertNotIn('fn3', e.message)
+
 
 if __name__ == '__main__':
   ops.enable_eager_execution(
diff --git a/tensorflow/python/eager/memory_test.py b/tensorflow/python/eager/memory_test.py
index 5e4516239c9a25dea3ce754a55618f61739ce458..9d29180379bd5bc48472f5c8638f01f667763111 100644
--- a/tensorflow/python/eager/memory_test.py
+++ b/tensorflow/python/eager/memory_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.variables import Variable
 
 # memory_profiler might not be available in the OSS version of TensorFlow.
 try:
@@ -81,6 +82,16 @@ class MemoryTest(test.TestCase):
           "Maximum allowed increase: %f") % (initial, increase,
                                              increase_threshold_absolute_mb)
 
+  def testMemoryLeakAnonymousVariable(self):
+    if memory_profiler is None:
+      self.skipTest("memory_profiler required to run this test")
+
+    def f():
+      inputs = Variable(array_ops.zeros([32, 100], dtypes.float32))
+      del inputs
+
+    self.assertNotIncreasingMemory(f, num_iters=10000)
+
   def testMemoryLeakInSimpleModelForwardOnly(self):
     if memory_profiler is None:
       self.skipTest("memory_profiler required to run this test")
diff --git a/tensorflow/python/eager/tape.py b/tensorflow/python/eager/tape.py
index e501b403a39144a673e8ac5155edf0498425bcd6..56b68b9eea822fa9423543aa4cded8307a817326 100644
--- a/tensorflow/python/eager/tape.py
+++ b/tensorflow/python/eager/tape.py
@@ -61,7 +61,7 @@ def watch(tape, tensor):
 
 def watch_variable(tape, variable):
   """Marks this variable to be watched by the given tape."""
-  strategy = distribution_strategy_context.get_distribution_strategy()
+  strategy = distribution_strategy_context.get_strategy()
   if distribution_strategy_context.get_replica_context():
     variables = [strategy.extended.value_container(variable)]
   else:
@@ -76,7 +76,7 @@ def variable_accessed(variable):
   Args:
     variable: variable to be watched.
   """
-  strategy = distribution_strategy_context.get_distribution_strategy()
+  strategy = distribution_strategy_context.get_strategy()
   if distribution_strategy_context.get_replica_context():
     variables = [strategy.extended.value_container(variable)]
   else:
diff --git a/tensorflow/python/eager/wrap_function.py b/tensorflow/python/eager/wrap_function.py
index 0930b6116d5bef9bc91d999ebbd4462e021fbbe3..6f978feb3cbb7fa6e305c2a0046b62ac40ac9a1a 100644
--- a/tensorflow/python/eager/wrap_function.py
+++ b/tensorflow/python/eager/wrap_function.py
@@ -38,8 +38,20 @@ class VariableHolder(object):
     self._variables = []
 
   def variable_creator_scope(self, next_creator, **kwargs):
+    """Creates variables & adds them to collections to match legacy code."""
     v = next_creator(**kwargs)
     self._variables.append(v)
+
+    collections = kwargs.get("collections")
+    trainable = v.trainable
+
+    if collections is None:
+      collections = [ops.GraphKeys.GLOBAL_VARIABLES]
+    if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
+      collections = list(collections) + [ops.GraphKeys.TRAINABLE_VARIABLES]
+
+    ops.add_to_collections(collections, v)
+
     return v
 
   def __call__(self, *args, **kwargs):
@@ -61,6 +73,13 @@ class WrappedFunction(function.Function):
     for f in flat_feeds:
       if not isinstance(f, ops.Tensor):
         raise ValueError("Feeds must be tensors.")
+
+    # Ignoring all feeds that are captures allows prune to be called
+    # using wrapped_func.inputs even when it uses variables
+    internal_captures = self.graph.internal_captures
+    flat_feeds = [f for f in flat_feeds
+                  if f not in internal_captures]
+
     tensor_fetches = []
     operation_fetches = []
     for f in flat_fetches:
@@ -87,7 +106,7 @@ class WrappedFunction(function.Function):
           sink_tensor = control_flow_ops.no_op()
     lift_map = lift_to_graph.lift_to_graph(
         sink_tensor, pruned_graph,
-        sources=flat_feeds + self.graph.internal_captures)
+        sources=flat_feeds + internal_captures)
     for original_fetch, identity_fetch in zip(
         tensor_fetches, identity_fetches):
       lift_map[original_fetch] = lift_map[identity_fetch]
@@ -98,6 +117,8 @@ class WrappedFunction(function.Function):
     pruned_graph.inputs.extend(lift_map[x] for x in flat_feeds)
     pruned_graph.inputs.extend(pruned_graph.captures.values())
 
+    pruned_graph.variables = self.graph.variables
+
     def _structured_output_mapping(fetched):
       lifted = lift_map[fetched]
       if isinstance(lifted, ops.Operation):
@@ -176,6 +197,7 @@ def wrap_function(fn, signature, name=None):
           name,
           holder,
           args=None, kwargs=None, signature=signature,
-          add_control_dependencies=False),
+          add_control_dependencies=False,
+          collections={}),
       variable_holder=holder,
       signature=signature)
diff --git a/tensorflow/python/eager/wrap_function_test.py b/tensorflow/python/eager/wrap_function_test.py
index 65dd73aafca8cb0f6930c334a62083c4d5cd6677..a6e1931fcdac796fe5851211f8aae4b21c7ed83b 100644
--- a/tensorflow/python/eager/wrap_function_test.py
+++ b/tensorflow/python/eager/wrap_function_test.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 
+from tensorflow.python.eager import backprop
 from tensorflow.python.eager import wrap_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -90,11 +91,93 @@ class WrapFunctionTest(test.TestCase):
 
     f_wrapped = wrap_function.wrap_function(f, [])
     self.assertAllEqual(6.0, f_wrapped())
+
+    # Test pruning directly on the inputs
+    pruned = f_wrapped.prune(
+        feeds=f_wrapped.inputs,
+        fetches=f_wrapped.graph.get_tensor_by_name('fetch:0'))
+    self.assertAllEqual(6.0, pruned())
+
+    # Test pruning with no inputs
     pruned = f_wrapped.prune(
         feeds=(),
         fetches=f_wrapped.graph.get_tensor_by_name('fetch:0'))
     self.assertAllEqual(6.0, pruned())
 
+  def testCollectionsIsolation(self):
+
+    v1 = variables.Variable(2.)
+    v2_holder = []
+    def f():
+      v2 = variables.Variable(3.)
+      v2_holder.append(v2)
+      ops.add_to_collection(ops.GraphKeys.LOSSES, v2 * constant_op.constant(3.))
+      return array_ops.identity(v1 * v2 * constant_op.constant(1.), 'fetch')
+
+    f_wrapped = wrap_function.wrap_function(f, [])
+    self.assertAllEqual(6.0, f_wrapped())
+    self.assertEqual(
+        len(f_wrapped.graph.get_collection(ops.GraphKeys.LOSSES)), 1)
+    f_var_collection = f_wrapped.graph.get_collection(
+        ops.GraphKeys.TRAINABLE_VARIABLES)
+    self.assertEqual(len(f_var_collection), 1)
+    self.assertIs(f_var_collection[0], v2_holder[0])
+
+    v3_holder = []
+    def g():
+      v3 = variables.Variable(4.)
+      v3_holder.append(v3)
+      ops.add_to_collection(ops.GraphKeys.LOSSES, v3 * constant_op.constant(3.))
+      return array_ops.identity(v1 * v3 * constant_op.constant(1.), 'fetch')
+
+    g_wrapped = wrap_function.wrap_function(g, [])
+    self.assertAllEqual(8.0, g_wrapped())
+    self.assertEqual(
+        len(g_wrapped.graph.get_collection(ops.GraphKeys.LOSSES)), 1)
+    g_var_collection = g_wrapped.graph.get_collection(
+        ops.GraphKeys.TRAINABLE_VARIABLES)
+    self.assertEqual(len(g_var_collection), 1)
+    self.assertIs(g_var_collection[0], v3_holder[0])
+
+    # Both have only one value, and their values aren't equal. So no sharing.
+    self.assertNotEqual(g_wrapped.graph.get_collection(ops.GraphKeys.LOSSES),
+                        f_wrapped.graph.get_collection(ops.GraphKeys.LOSSES))
+
+  def testGradientsOfPrune(self):
+
+    v1 = variables.Variable(2.)
+    v2_holder = []
+
+    def f(z):
+      v2 = variables.Variable(3.)
+      v2_holder.append(v2)
+      return array_ops.identity(v1 * v2 * z, 'fetch')
+
+    f_wrapped = wrap_function.wrap_function(
+        f, [tensor_spec.TensorSpec((), dtype=dtypes.float32)])
+
+    x = constant_op.constant(1.)
+    with backprop.GradientTape() as tape:
+      tape.watch(x)
+      out = f_wrapped(x)
+    grads = tape.gradient(out, [x, v1, v2_holder[0]])
+
+    self.assertAllEqual(6.0, out)
+    self.assertAllEqual([6.0, 3.0, 2.0], grads)
+
+    pruned = f_wrapped.prune(
+        feeds=f_wrapped.inputs,
+        fetches=f_wrapped.graph.get_tensor_by_name('fetch:0'))
+
+    x = constant_op.constant(1.)
+    with backprop.GradientTape() as tape:
+      tape.watch(x)
+      out = pruned(x)
+    grads = tape.gradient(out, [x, v1, v2_holder[0]])
+
+    self.assertAllEqual(6.0, out)
+    self.assertAllEqual([6.0, 3.0, 2.0], grads)
+
   def testPruneOperations(self):
 
     v = variables.Variable(0)
diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD
index d24a7ae80c86d407ae3bb60ca55fff98be9f27a1..2b986348b7879554daf741cf7bda8f031a4572c2 100644
--- a/tensorflow/python/feature_column/BUILD
+++ b/tensorflow/python/feature_column/BUILD
@@ -4,7 +4,7 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
 py_library(
     name = "feature_column_py",
@@ -94,19 +94,13 @@ filegroup(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "feature_column_test",
     srcs = ["feature_column_test.py"],
-    data = [":vocabulary_testdata"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_pip",
-        "no_windows",
-    ],
-    deps = [
+    additional_deps = [
         ":feature_column",
         ":feature_column_py",
+        "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -125,24 +119,22 @@ py_test(
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
-        "//third_party/py/numpy",
     ],
-)
-
-py_test(
-    name = "feature_column_v2_test",
-    srcs = ["feature_column_v2_test.py"],
     data = [":vocabulary_testdata"],
-    shard_count = 5,
-    srcs_version = "PY2AND3",
     tags = [
         "no_cuda_on_cpu_tap",
         "no_pip",
         "no_windows",
     ],
-    deps = [
+)
+
+tf_py_test(
+    name = "feature_column_v2_test",
+    srcs = ["feature_column_v2_test.py"],
+    additional_deps = [
         ":feature_column_py",
         ":feature_column_v2",
+        "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -162,6 +154,12 @@ py_test(
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:estimator_py",
-        "//third_party/py/numpy",
+    ],
+    data = [":vocabulary_testdata"],
+    shard_count = 5,
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_pip",
+        "no_windows",
     ],
 )
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index daa0a3b3a4bb5fd067681c5ca91eaccdc64d3144..0ded2bf8c9fc9a7dcf1b100da3258b9e8f30a4b3 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -1832,7 +1832,7 @@ class LinearModelTest(test.TestCase):
       }
     with self.assertRaisesRegexp(
         ValueError,
-        'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        r'Batch size \(first dimension\) of each feature must be same.'):
       fc.linear_model(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
@@ -1847,7 +1847,7 @@ class LinearModelTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc.linear_model(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
@@ -2467,7 +2467,7 @@ class _LinearModelTest(test.TestCase):
       }
     with self.assertRaisesRegexp(
         ValueError,
-        'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
       get_keras_linear_model_predictions(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
@@ -2482,7 +2482,7 @@ class _LinearModelTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         get_keras_linear_model_predictions(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
@@ -2974,7 +2974,7 @@ class FunctionalInputLayerTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc.input_layer(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
@@ -2989,7 +2989,7 @@ class FunctionalInputLayerTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc.input_layer(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py
index a2474253697ad526c33c0099bf955b96000cf0f7..aeaa8df8b5ee0851d4821663ad975ac401ab69f5 100644
--- a/tensorflow/python/feature_column/feature_column_v2_test.py
+++ b/tensorflow/python/feature_column/feature_column_v2_test.py
@@ -2015,7 +2015,7 @@ class LinearModelTest(test.TestCase):
       }
       model(features)
       for var in model.variables:
-        self.assertTrue(isinstance(var, variables_lib.RefVariable))
+        self.assertIsInstance(var, variables_lib.VariableV1)
       variable_names = [var.name for var in model.variables]
       self.assertItemsEqual([
           'linear_model/dense_feature_bucketized/weights:0',
@@ -2052,7 +2052,7 @@ class LinearModelTest(test.TestCase):
       }
     with self.assertRaisesRegexp(
         ValueError,
-        'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
       model = fc.LinearModel([price1, price2])
       model(features)
 
@@ -2068,7 +2068,7 @@ class LinearModelTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         model = fc.LinearModel([price1, price2, price3])
         model(features)
 
@@ -2818,7 +2818,7 @@ class OldLinearModelTest(test.TestCase):
       }
     with self.assertRaisesRegexp(
         ValueError,
-        'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
       fc_old.linear_model(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
@@ -2833,7 +2833,7 @@ class OldLinearModelTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc_old.linear_model(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
@@ -3435,7 +3435,7 @@ class DenseFeaturesTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc.DenseFeatures([price1, price2])(features)
 
   def test_subset_of_static_batch_size_mismatch(self):
@@ -3450,7 +3450,7 @@ class DenseFeaturesTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc.DenseFeatures([price1, price2, price3])(features)
 
   def test_runtime_batch_size_mismatch(self):
@@ -4010,7 +4010,7 @@ class FunctionalInputLayerTest(test.TestCase):
       self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized]))
       self.assertEqual(1, len(cols_to_vars[some_embedding_column]))
       self.assertIsInstance(cols_to_vars[some_embedding_column][0],
-                            variables_lib.Variable)
+                            variables_lib.VariableV1)
       self.assertAllEqual(cols_to_vars[some_embedding_column][0].shape, [5, 10])
 
   @test_util.run_deprecated_v1
@@ -4141,7 +4141,7 @@ class FunctionalInputLayerTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc_old.input_layer(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
@@ -4156,7 +4156,7 @@ class FunctionalInputLayerTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc_old.input_layer(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
@@ -6839,7 +6839,7 @@ class EmbeddingColumnTest(test.TestCase):
     self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',),
                           tuple([v.name for v in global_vars]))
     for v in global_vars:
-      self.assertTrue(isinstance(v, variables_lib.RefVariable))
+      self.assertIsInstance(v, variables_lib.Variable)
     trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
     self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',),
                           tuple([v.name for v in trainable_vars]))
@@ -7732,7 +7732,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'],
         tuple([v.name for v in global_vars]))
     for v in global_vars:
-      self.assertTrue(isinstance(v, variables_lib.RefVariable))
+      self.assertIsInstance(v, variables_lib.Variable)
     trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
     if trainable:
       self.assertItemsEqual(
diff --git a/tensorflow/python/framework/error_interpolation.py b/tensorflow/python/framework/error_interpolation.py
index 5e1bed8e0e2afdc3b5b5c89b4a7daca780273831..af83b70a465cd061c2ed713639cc4a5d531f388d 100644
--- a/tensorflow/python/framework/error_interpolation.py
+++ b/tensorflow/python/framework/error_interpolation.py
@@ -31,7 +31,7 @@ import six
 
 from tensorflow.python.util import tf_stack
 
-_NAME_REGEX = r"[A-Za-z0-9.][A-Za-z0-9_.\-/]*?"
+_NAME_REGEX = r"[A-Za-z0-9_.][A-Za-z0-9_.\-/]*?"
 _TAG_REGEX = r"{{{{({name}) ({name})}}}}".format(name=_NAME_REGEX)
 _INTERPOLATION_REGEX = r"^(.*?)({tag})".format(tag=_TAG_REGEX)
 _INTERPOLATION_PATTERN = re.compile(_INTERPOLATION_REGEX, re.DOTALL)
@@ -45,7 +45,7 @@ _BAD_FILE_SUBSTRINGS = [
 ]
 
 
-def _parse_message(message):
+def parse_message(message):
   """Parses the message.
 
   Splits the message into separators and tags. Tags are named tuples
@@ -376,7 +376,7 @@ def interpolate(error_message, graph):
   Returns:
     The string with tags of the form {{type name}} interpolated.
   """
-  seps, tags = _parse_message(error_message)
+  seps, tags = parse_message(error_message)
   subs = []
   end_msg = collections.defaultdict(list)
   tagged_ops = []
@@ -404,6 +404,8 @@ def interpolate(error_message, graph):
         msg = "node %s%s placed on device %s " % (
             ops[0].name, field_dict["defined_at"], field_dict["devices"])
         end_msg["colocations"].append(field_dict["devs_and_colocs"])
+    if tag.type == "function_node":
+      msg = ""
     subs.append(msg)
 
   if "source_nodes" in end_msg:
diff --git a/tensorflow/python/framework/func_graph.py b/tensorflow/python/framework/func_graph.py
index 9528a24b46b3e7e76df7355241cafd1003542f11..9603c1536d375394b43675e318ad0a2233c0b928 100644
--- a/tensorflow/python/framework/func_graph.py
+++ b/tensorflow/python/framework/func_graph.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
+import collections as py_collections
 import weakref
 
 from tensorflow.core.framework import attr_value_pb2
@@ -80,7 +80,7 @@ class FuncGraph(ops.Graph):
     seed: The graph-level random seed.
   """
 
-  def __init__(self, name, read_only_collections=True):
+  def __init__(self, name, collections=None):
     """Construct a new FuncGraph.
 
     The graph will inherit its graph key, collections, seed, and distribution
@@ -88,8 +88,13 @@ class FuncGraph(ops.Graph):
 
     Args:
       name: the name of the function.
-      read_only_collections: whether to not write function graph collections
-        back to default graph. Defaults to True.
+      collections: a dictionary of collections this FuncGraph should start
+        with. If not specified (None), the FuncGraph will read (but not write
+        to) the outer graph's collections that are not whitelisted, and both
+        read and write to the outer graph's collections that are whitelisted.
+        The current whitelisted collections are the global variables, the
+        local variables, and the trainable variables.
+        Defaults to None.
     """
     super(FuncGraph, self).__init__()
 
@@ -97,10 +102,9 @@ class FuncGraph(ops.Graph):
     self.inputs = []
     self.outputs = []
     self.structured_outputs = None
-    self._read_only_collections = read_only_collections
     self._weak_variables = []
     self.outer_graph = ops.get_default_graph()
-    self.captures = collections.OrderedDict()
+    self.captures = py_collections.OrderedDict()
 
     self._building_function = True
     # Map from resource tensor name to last op (in program order) which uses
@@ -122,9 +126,7 @@ class FuncGraph(ops.Graph):
       # specialization (currently used in cond_v2), here and in the cache key.
       self._colocation_stack = graph._colocation_stack.copy()  # pylint: disable=protected-access
 
-    if not self._read_only_collections:
-      self._collections = graph._collections  # pylint: disable=protected-access
-    else:
+    if collections is None:
       for collection_name in graph.get_all_collection_keys():
         if collection_name not in WHITELIST_COLLECTIONS:
           self._collections[collection_name] = graph.get_collection(
@@ -132,6 +134,8 @@ class FuncGraph(ops.Graph):
       for collection_name in WHITELIST_COLLECTIONS:
         self._collections[collection_name] = graph.get_collection_ref(
             collection_name)
+    else:
+      self._collections = collections
 
   def as_default(self):
     outer_cm = super(FuncGraph, self).as_default()
@@ -338,7 +342,8 @@ def func_graph_from_py_func(name,
                             autograph=False,
                             add_control_dependencies=True,
                             arg_names=None,
-                            op_return_value=None):
+                            op_return_value=None,
+                            collections=None):
   """Returns a `FuncGraph` generated from `python_func`.
 
   Args:
@@ -365,6 +370,13 @@ def func_graph_from_py_func(name,
     op_return_value: Optional. A Tensor. If set and `python_func` returns
       Operations, those return values will be replaced with this value. If not
       set, returning an Operation triggers an error.
+    collections: a dictionary of collections this FuncGraph should start
+      with. If not specified (None), the FuncGraph will read (but not write to)
+      the outer graph's collections that are not whitelisted, and both
+      read and write to the outer graph's collections that are whitelisted.
+      The current whitelisted collections are the global variables, the
+      local variables, and the trainable variables.
+      Defaults to None.
 
   Returns:
     A FuncGraph.
@@ -376,7 +388,7 @@ def func_graph_from_py_func(name,
   if op_return_value is not None:
     assert isinstance(op_return_value, ops.Tensor), op_return_value
   if func_graph is None:
-    func_graph = FuncGraph(name)
+    func_graph = FuncGraph(name, collections=collections)
   assert isinstance(func_graph, FuncGraph)
   if add_control_dependencies:
     control_manager = AutomaticControlDependencies
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index e7a9af4866246a87755fcae27446da4045487f77..079fad4210f04bce81a1b4e528e4c65b9794dfaa 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -5474,6 +5474,9 @@ def disable_eager_execution():
   projects from TensorFlow 1.x to 2.x.
   """
   context.default_execution_mode = context.GRAPH_MODE
+  c = context.context_safe()
+  if c is not None:
+    c._eager_context.is_eager = False  # pylint: disable=protected-access
 
 
 def enable_eager_execution_internal(config=None,
@@ -6026,7 +6029,15 @@ class name_scope(object):  # pylint: disable=invalid-name
       name: The name argument that is passed to the op function.
       default_name: The default name to use if the `name` argument is `None`.
       values: The list of `Tensor` arguments that are passed to the op function.
+
+    Raises:
+      TypeError: if `default_name` is passed in but not a string.
     """
+    if not (default_name is None or isinstance(default_name, six.string_types)):
+      raise TypeError(
+          "`default_name` type (%s) is not a string type. You likely meant to "
+          "pass this into the `values` kwarg."
+          % type(default_name))
     self._name = default_name if name is None else name
     self._default_name = default_name
     self._values = values
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 2d7ee1a99e02cbb663df38ae17d8772fa6f11816..58d311fe4e7e645d1a9965208638c505195a2563 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -2052,6 +2052,9 @@ class OpScopeTest(test_util.TensorFlowTestCase):
     with ops.name_scope(None, default_scope_name, [a, b]) as scope:
       self.assertEqual("%s/" % default_scope_name, scope)
       self.assertEqual(g0, ops.get_default_graph())
+    with self.assertRaises(TypeError):
+      with ops.name_scope(scope_name, [a, b]):
+        pass
 
   def _testGraphElements(self, graph_elements):
     scope_name = "my_scope"
diff --git a/tensorflow/python/framework/sparse_tensor.py b/tensorflow/python/framework/sparse_tensor.py
index 5e1a95a26be034bff0a1f5eb996ac6f16c61e282..8546c2299aad8f6145f8dd59c3c51410038d8847 100644
--- a/tensorflow/python/framework/sparse_tensor.py
+++ b/tensorflow/python/framework/sparse_tensor.py
@@ -113,16 +113,12 @@ class SparseTensor(_TensorLike):
       dense_shape: A 1-D int64 tensor of shape `[ndims]`.
 
     """
-    with ops.name_scope(None, "SparseTensor",
-                        [indices, values, dense_shape]):
+    with ops.name_scope(None, "SparseTensor", [indices, values, dense_shape]):
       indices = ops.convert_to_tensor(
           indices, name="indices", dtype=dtypes.int64)
-      # Always pass as_ref=True because we want to be able to update
-      # values later if it is a VariableOp.
       # TODO(touts): Consider adding mutable_values() when 'values'
       # is a VariableOp and updating users of SparseTensor.
-      values = ops.internal_convert_to_tensor(
-          values, name="values", as_ref=True)
+      values = ops.internal_convert_to_tensor(values, name="values")
       dense_shape = ops.convert_to_tensor(
           dense_shape, name="dense_shape", dtype=dtypes.int64)
     self._indices = indices
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index a4c626c64c9557ef9562660ffcb712a79bc01cbd..a7537bb5f1adfe70018f50cb9a627bfffe176226 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -271,10 +271,11 @@ class Dimension(object):
     Dimensions are combined as follows:
 
     ```python
-    tf.Dimension(n)   .merge_with(tf.Dimension(n))    == tf.Dimension(n)
-    tf.Dimension(n)   .merge_with(tf.Dimension(None)) == tf.Dimension(n)
-    tf.Dimension(None).merge_with(tf.Dimension(n))    == tf.Dimension(n)
-    tf.Dimension(None).merge_with(tf.Dimension(None)) == tf.Dimension(None)
+    tf.Dimension(n)   .merge_with(tf.Dimension(n))     == tf.Dimension(n)
+    tf.Dimension(n)   .merge_with(tf.Dimension(None))  == tf.Dimension(n)
+    tf.Dimension(None).merge_with(tf.Dimension(n))     == tf.Dimension(n)
+    # equivalent to tf.Dimension(None)
+    tf.Dimension(None).merge_with(tf.Dimension(None))
 
     # raises ValueError for n != m
     tf.Dimension(n)   .merge_with(tf.Dimension(m))
@@ -304,10 +305,10 @@ class Dimension(object):
     Dimensions are summed as follows:
 
     ```python
-    tf.Dimension(m)    + tf.Dimension(n)    == tf.Dimension(m + n)
-    tf.Dimension(m)    + tf.Dimension(None) == tf.Dimension(None)
-    tf.Dimension(None) + tf.Dimension(n)    == tf.Dimension(None)
-    tf.Dimension(None) + tf.Dimension(None) == tf.Dimension(None)
+    tf.Dimension(m)    + tf.Dimension(n)     == tf.Dimension(m + n)
+    tf.Dimension(m)    + tf.Dimension(None)  # equiv. to tf.Dimension(None)
+    tf.Dimension(None) + tf.Dimension(n)     # equiv. to tf.Dimension(None)
+    tf.Dimension(None) + tf.Dimension(None)  # equiv. to tf.Dimension(None)
     ```
 
     Args:
@@ -339,10 +340,10 @@ class Dimension(object):
     Dimensions are subtracted as follows:
 
     ```python
-    tf.Dimension(m)    - tf.Dimension(n)    == tf.Dimension(m - n)
-    tf.Dimension(m)    - tf.Dimension(None) == tf.Dimension(None)
-    tf.Dimension(None) - tf.Dimension(n)    == tf.Dimension(None)
-    tf.Dimension(None) - tf.Dimension(None) == tf.Dimension(None)
+    tf.Dimension(m)    - tf.Dimension(n)     == tf.Dimension(m - n)
+    tf.Dimension(m)    - tf.Dimension(None)  # equiv. to tf.Dimension(None)
+    tf.Dimension(None) - tf.Dimension(n)     # equiv. to tf.Dimension(None)
+    tf.Dimension(None) - tf.Dimension(None)  # equiv. to tf.Dimension(None)
     ```
 
     Args:
@@ -378,10 +379,10 @@ class Dimension(object):
     Dimensions are summed as follows:
 
     ```python
-    tf.Dimension(m)    * tf.Dimension(n)    == tf.Dimension(m * n)
-    tf.Dimension(m)    * tf.Dimension(None) == tf.Dimension(None)
-    tf.Dimension(None) * tf.Dimension(n)    == tf.Dimension(None)
-    tf.Dimension(None) * tf.Dimension(None) == tf.Dimension(None)
+    tf.Dimension(m)    * tf.Dimension(n)     == tf.Dimension(m * n)
+    tf.Dimension(m)    * tf.Dimension(None)  # equiv. to tf.Dimension(None)
+    tf.Dimension(None) * tf.Dimension(n)     # equiv. to tf.Dimension(None)
+    tf.Dimension(None) * tf.Dimension(None)  # equiv. to tf.Dimension(None)
     ```
 
     Args:
@@ -417,10 +418,10 @@ class Dimension(object):
     Dimensions are divided as follows:
 
     ```python
-    tf.Dimension(m)    // tf.Dimension(n)    == tf.Dimension(m // n)
-    tf.Dimension(m)    // tf.Dimension(None) == tf.Dimension(None)
-    tf.Dimension(None) // tf.Dimension(n)    == tf.Dimension(None)
-    tf.Dimension(None) // tf.Dimension(None) == tf.Dimension(None)
+    tf.Dimension(m)    // tf.Dimension(n)     == tf.Dimension(m // n)
+    tf.Dimension(m)    // tf.Dimension(None)  # equiv. to tf.Dimension(None)
+    tf.Dimension(None) // tf.Dimension(n)     # equiv. to tf.Dimension(None)
+    tf.Dimension(None) // tf.Dimension(None)  # equiv. to tf.Dimension(None)
     ```
 
     Args:
@@ -475,10 +476,10 @@ class Dimension(object):
     Dimension moduli are computed as follows:
 
     ```python
-    tf.Dimension(m)    % tf.Dimension(n)    == tf.Dimension(m % n)
-    tf.Dimension(m)    % tf.Dimension(None) == tf.Dimension(None)
-    tf.Dimension(None) % tf.Dimension(n)    == tf.Dimension(None)
-    tf.Dimension(None) % tf.Dimension(None) == tf.Dimension(None)
+    tf.Dimension(m)    % tf.Dimension(n)     == tf.Dimension(m % n)
+    tf.Dimension(m)    % tf.Dimension(None)  # equiv. to tf.Dimension(None)
+    tf.Dimension(None) % tf.Dimension(n)     # equiv. to tf.Dimension(None)
+    tf.Dimension(None) % tf.Dimension(None)  # equiv. to tf.Dimension(None)
     ```
 
     Args:
diff --git a/tensorflow/python/framework/test_ops.cc b/tensorflow/python/framework/test_ops.cc
index 99e184a8acd44012774917c4baaecd48bae6cbe3..1d0145f61c84969cf1b52eb070ec3f933d25741a 100644
--- a/tensorflow/python/framework/test_ops.cc
+++ b/tensorflow/python/framework/test_ops.cc
@@ -157,7 +157,7 @@ REGISTER_KERNEL_BUILDER(Name("Old").Device(DEVICE_CPU), OldOp);
 // Stubbed-out resource to test resource handle ops.
 class StubResource : public ResourceBase {
  public:
-  string DebugString() override { return ""; }
+  string DebugString() const override { return ""; }
 };
 
 REGISTER_RESOURCE_HANDLE_KERNEL(StubResource);
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 7a6ecaf5a876c93014e89c5aa90d2ad4fef6f7eb..6d01d3bf546297499f9244a94e8069caa6def9c2 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -373,7 +373,7 @@ def skip_if(condition):
       else:
         skip = condition
       if not skip:
-        fn(*args, **kwargs)
+        return fn(*args, **kwargs)
 
     return wrapper
 
@@ -410,7 +410,7 @@ def enable_control_flow_v2(fn):
     enable_control_flow_v2_old = control_flow_util.ENABLE_CONTROL_FLOW_V2
     control_flow_util.ENABLE_CONTROL_FLOW_V2 = True
     try:
-      fn(*args, **kwargs)
+      return fn(*args, **kwargs)
     finally:
       control_flow_util.ENABLE_CONTROL_FLOW_V2 = enable_control_flow_v2_old
 
@@ -594,9 +594,9 @@ def assert_no_new_tensors(f):
       ops.get_default_graph()._graph_key = outside_graph_key
       if outside_executed_eagerly:
         with context.eager_mode():
-          f(self, **kwargs)
+          result = f(self, **kwargs)
       else:
-        f(self, **kwargs)
+        result = f(self, **kwargs)
     # Make an effort to clear caches, which would otherwise look like leaked
     # Tensors.
     context.context()._clear_caches()  # pylint: disable=protected-access
@@ -610,6 +610,7 @@ def assert_no_new_tensors(f):
           len(tensors_after),
           str(tensors_after),
       )))
+    return result
 
   return decorator
 
@@ -734,14 +735,14 @@ def assert_no_garbage_created(f):
     """Sets DEBUG_SAVEALL, runs the test, and checks for new garbage."""
     # Force-load `distribution_strategy_context` to prevent GC at
     # test time when using eager. Remove once b/117329403 is resolved.
-    tape.distribution_strategy_context.get_distribution_strategy()
+    tape.distribution_strategy_context.get_strategy()
 
     gc.disable()
     previous_debug_flags = gc.get_debug()
     gc.set_debug(gc.DEBUG_SAVEALL)
     gc.collect()
     previous_garbage = len(gc.garbage)
-    f(self, **kwargs)
+    result = f(self, **kwargs)
     gc.collect()
     new_garbage = len(gc.garbage)
     if new_garbage > previous_garbage:
@@ -786,6 +787,7 @@ def assert_no_garbage_created(f):
     # not hold on to every object in other tests.
     gc.set_debug(previous_debug_flags)
     gc.enable()
+    return result
 
   return decorator
 
@@ -1074,9 +1076,9 @@ def deprecated_graph_mode_only(func=None):
     def decorated(self, *args, **kwargs):
       if tf2.enabled():
         with context.graph_mode():
-          f(self, *args, **kwargs)
+          return f(self, *args, **kwargs)
       else:
-        f(self, *args, **kwargs)
+        return f(self, *args, **kwargs)
 
     return decorated
 
@@ -1126,7 +1128,7 @@ def run_v1_only(reason, func=None):
       if tf2.enabled():
         self.skipTest(reason)
 
-      f(self, *args, **kwargs)
+      return f(self, *args, **kwargs)
 
     return decorated
 
@@ -1163,7 +1165,7 @@ def run_v2_only(func=None):
       if not tf2.enabled():
         self.skipTest("Test is only comptaible in v2")
 
-      f(self, *args, **kwargs)
+      return f(self, *args, **kwargs)
 
     return decorated
 
@@ -1196,7 +1198,7 @@ def run_gpu_only(func=None):
       if not is_gpu_available():
         self.skipTest("Test requires GPU")
 
-      f(self, *args, **kwargs)
+      return f(self, *args, **kwargs)
 
     return decorated
 
@@ -1229,7 +1231,7 @@ def run_cuda_only(func=None):
       if not is_gpu_available(cuda_only=True):
         self.skipTest("Test requires CUDA GPU")
 
-      f(self, *args, **kwargs)
+      return f(self, *args, **kwargs)
 
     return decorated
 
diff --git a/tensorflow/python/grappler/cluster.i b/tensorflow/python/grappler/cluster.i
index 87795ffcfb5d21c408d646e581e19fe23a37b945..b0c1f71a851e9bda2f8e75bd1db1daf566446805 100644
--- a/tensorflow/python/grappler/cluster.i
+++ b/tensorflow/python/grappler/cluster.i
@@ -132,7 +132,7 @@ struct GCluster {
 
 static GCluster TF_NewCluster(bool allow_soft_placement,
                    bool disable_detailed_stats, TF_Status* out_status) {
-    int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
+  int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
   int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
   int timeout_s = 60 * 10;
   tensorflow::grappler::Cluster* cluster_ =
diff --git a/tensorflow/python/grappler/cluster.py b/tensorflow/python/grappler/cluster.py
index 079d07115b31da86600821a098aec08ec60bf436..428b52402cffc16bd692cac5839494a617815236 100644
--- a/tensorflow/python/grappler/cluster.py
+++ b/tensorflow/python/grappler/cluster.py
@@ -71,26 +71,21 @@ class Cluster(object):
     return self._tf_cluster
 
   def ListDevices(self):
-    """Returns the list of available hardware devices."""
-    devices = []
-    if self._tf_cluster is not None:
-      ret_from_swig = tf_cluster.TF_ListDevices(self._tf_cluster)
-      devices = []
-      for raw_dev in ret_from_swig:
-        devices.append(device_properties_pb2.NamedDevice.FromString(raw_dev))
-    return devices
+    """Returns a list of available hardware devices."""
+    if self._tf_cluster is None:
+      return []
+    return [device_properties_pb2.NamedDevice.FromString(device)
+            for device in tf_cluster.TF_ListDevices(self._tf_cluster)]
 
   def ListAvailableOps(self):
-    """Returns a list of all the available operations (sorted alphatically)."""
+    """Returns a list of all available operations (sorted alphabetically)."""
     return tf_cluster.TF_ListAvailableOps()
 
   def GetSupportedDevices(self, item):
     return tf_cluster.TF_GetSupportedDevices(self._tf_cluster, item.tf_item)
 
   def EstimatePerformance(self, device):
-    """Estimate the performance of the specified device."""
-    serialized = device.SerializeToString()
-    return tf_cluster.TF_EstimatePerformance(serialized)
+    return tf_cluster.TF_EstimatePerformance(device.SerializeToString())
 
   def MeasureCosts(self, item):
     """Returns the cost of running the specified item.
@@ -107,10 +102,8 @@ class Cluster(object):
       return None
 
     op_perf_bytes_list, run_time, step_stats_bytes = ret_from_swig
-    op_perfs = []
-    for op_perf_bytes in op_perf_bytes_list:
-      op_perfs.append(
-          op_performance_data_pb2.OpPerformance.FromString(op_perf_bytes))
+    op_perfs = [op_performance_data_pb2.OpPerformance.FromString(op_perf_bytes)
+                for op_perf_bytes in op_perf_bytes_list]
     return (op_perfs, run_time,
             step_stats_pb2.StepStats.FromString(step_stats_bytes))
 
@@ -122,11 +115,9 @@ class Cluster(object):
     Returns: A hashtable indexed by device name.
     """
     with errors.raise_exception_on_not_ok_status() as status:
-      ret_from_swig = tf_cluster.TF_DeterminePeakMemoryUsage(
+      return tf_cluster.TF_DeterminePeakMemoryUsage(
           item.tf_item, self._tf_cluster, status)
 
-    return ret_from_swig
-
 
 @contextlib.contextmanager
 def Provision(allow_soft_placement=True,
diff --git a/tensorflow/python/grappler/cost_analyzer.cc b/tensorflow/python/grappler/cost_analyzer.cc
index b474e19894957d01c7c8978282c547df81a9b2b3..bb8c6d5b85565713f7753626775a2f405b7243b7 100644
--- a/tensorflow/python/grappler/cost_analyzer.cc
+++ b/tensorflow/python/grappler/cost_analyzer.cc
@@ -42,9 +42,13 @@ Status CostAnalyzer::GenerateReport(std::ostream& os, bool per_node_report,
 void CostAnalyzer::PredictCosts(CostEstimator* cost_estimator,
                                 CostGraphDef* cost_graph, int64* total_time) {
   TF_CHECK_OK(cost_estimator->Initialize(*item_));
+  RunMetadata run_metadata;
   Costs costs;
-  const Status status =
-      cost_estimator->PredictCosts(item_->graph, cost_graph, &costs);
+  const Status status = cost_estimator->PredictCostsAndReturnRunMetadata(
+      item_->graph, &run_metadata, &costs);
+  if (cost_graph) {
+    cost_graph->Swap(run_metadata.mutable_cost_graph());
+  }
   *total_time = costs.execution_time.count();
   if (!status.ok()) {
     LOG(ERROR) << "Could not estimate the cost for item " << item_->id << ": "
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index d833ae0fa21b9c54a93a1d0a9e1be4e228bd48a3..fb0161a677c7dd3bd1c9f08dc93a7cd60ff7a2ad 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -1,5 +1,7 @@
 # Description:
 #   Contains the Keras API (internal TensorFlow version).
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
 licenses(["notice"])  # Apache 2.0
 
@@ -7,9 +9,6 @@ package(default_visibility = ["//visibility:public"])
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
-load("//tensorflow:tensorflow.bzl", "cuda_py_test")
-
 config_setting(
     name = "empty_condition",
     values = {"define": "UNUSED=unused"},
@@ -304,180 +303,167 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "integration_test",
     size = "medium",
     srcs = ["integration_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:layers",
         "//tensorflow/python:nn",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "activations_test",
     size = "small",
     srcs = ["activations_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "constraints_test",
     size = "small",
     srcs = ["constraints_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "initializers_test",
     size = "small",
     srcs = ["initializers_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:init_ops",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "regularizers_test",
     size = "small",
     srcs = ["regularizers_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "optimizers_test",
     size = "medium",
     srcs = ["optimizers_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:training",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
+    shard_count = 4,
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "losses_test",
     size = "small",
     srcs = ["losses_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "metrics_functional_test",
     size = "medium",
     srcs = ["metrics_functional_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "metrics_test",
     size = "medium",
     srcs = ["metrics_test.py"],
-    shard_count = 4,
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
 )
 
-py_test(
+tf_py_test(
     name = "applications_test",
     size = "enormous",
     srcs = ["applications/applications_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 2,
 )
 
-py_test(
+tf_py_test(
     name = "advanced_activations_test",
     size = "medium",
     srcs = ["layers/advanced_activations_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "convolutional_recurrent_test",
     size = "large",
     srcs = ["layers/convolutional_recurrent_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 2,
 )
 
-py_test(
+tf_py_test(
     name = "convolutional_test",
     size = "large",
     srcs = ["layers/convolutional_test.py"],
-    shard_count = 11,
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 11,
 )
 
 cuda_py_test(
@@ -494,30 +480,29 @@ cuda_py_test(
     tags = ["no_windows_gpu"],
 )
 
-py_test(
+tf_py_test(
     name = "pooling_test",
-    size = "large",
+    size = "medium",
     srcs = ["layers/pooling_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 8,
 )
 
-py_test(
+tf_py_test(
     name = "core_test",
     size = "medium",
     srcs = ["layers/core_test.py"],
-    shard_count = 3,
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 3,
 )
 
 cuda_py_test(
@@ -531,121 +516,113 @@ cuda_py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "local_test",
     size = "medium",
     srcs = ["layers/local_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 2,
+    tags = ["no_windows"],
 )
 
-py_test(
+tf_py_test(
     name = "merge_test",
     size = "small",
     srcs = ["layers/merge_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "noise_test",
     size = "small",
     srcs = ["layers/noise_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "normalization_test",
     size = "medium",
     srcs = ["layers/normalization_test.py"],
-    shard_count = 3,
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 3,
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "simplernn_test",
     size = "medium",
     srcs = ["layers/simplernn_test.py"],
-    shard_count = 4,
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "gru_test",
     size = "large",
     srcs = ["layers/gru_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],  # http://b/62136390
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 2,
+    tags = ["notsan"],  # http://b/62136390
 )
 
-py_test(
+tf_py_test(
     name = "lstm_test",
     size = "medium",
     srcs = ["layers/lstm_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
     shard_count = 4,
-    srcs_version = "PY2AND3",
     tags = [
         "noasan",  # times out b/63678675
         "notsan",  # http://b/62189182
     ],
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
 )
 
-py_test(
+tf_py_test(
     name = "recurrent_test",
     size = "medium",
     srcs = ["layers/recurrent_test.py"],
-    shard_count = 4,
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
 )
 
 cuda_py_test(
@@ -674,55 +651,57 @@ cuda_py_test(
     shard_count = 6,
 )
 
-py_test(
+tf_py_test(
     name = "serialization_test",
     size = "small",
     srcs = ["layers/serialization_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "wrappers_test",
     size = "medium",
     srcs = ["layers/wrappers_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
     shard_count = 4,
-    srcs_version = "PY2AND3",
     tags = [
         "noasan",  # http://b/78599823
         "notsan",
     ],
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
 )
 
-py_test(
+tf_py_test(
     name = "scikit_learn_test",
     size = "small",
     srcs = ["wrappers/scikit_learn_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "data_utils_test",
     size = "large",
     srcs = ["utils/data_utils_test.py"],
-    srcs_version = "PY2AND3",
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
     tags = [
         "no_oss",
         "no_windows",
@@ -730,64 +709,54 @@ py_test(
         "notsan",
         "optonly",  # times out
     ],
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
 )
 
-py_test(
+tf_py_test(
     name = "generic_utils_test",
     size = "small",
     srcs = ["utils/generic_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_utils_test",
     size = "small",
     srcs = ["utils/tf_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "io_utils_test",
     size = "small",
     srcs = ["utils/io_utils_test.py"],
-    srcs_version = "PY2AND3",
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
     tags = [
         "no_windows",  # TODO: needs investigation on Windows
         "notsan",
     ],
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
 )
 
-py_test(
+tf_py_test(
     name = "np_utils_test",
     size = "small",
     srcs = ["utils/np_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
@@ -818,287 +787,267 @@ cuda_py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "conv_utils_test",
     size = "small",
     srcs = ["utils/conv_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "image_test",
     size = "medium",
     srcs = ["preprocessing/image_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "sequence_test",
     size = "small",
     srcs = ["preprocessing/sequence_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "text_test",
     size = "small",
     srcs = ["preprocessing/text_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "callbacks_test",
     size = "medium",
     srcs = ["callbacks_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "correctness_test",
     size = "medium",
     srcs = ["engine/correctness_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 2,
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "training_test",
     size = "medium",
     srcs = ["engine/training_test.py"],
-    shard_count = 16,
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 16,
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "training_dataset_test",
     size = "medium",
     srcs = ["engine/training_dataset_test.py"],
-    shard_count = 4,
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
 )
 
-py_test(
+tf_py_test(
     name = "training_generator_test",
     size = "large",
     srcs = ["engine/training_generator_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
     shard_count = 3,
-    srcs_version = "PY2AND3",
     tags = [
         "no_oss",
         "notsan",
     ],
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
 )
 
-py_test(
+tf_py_test(
     name = "feature_columns_integration_test",
     size = "small",
     srcs = ["engine/feature_columns_integration_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python/feature_column:feature_column_py",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "training_eager_test",
     size = "medium",
     srcs = ["engine/training_eager_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "training_utils_test",
     size = "medium",
     srcs = ["engine/training_utils_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "model_subclassing_test",
     size = "medium",
     srcs = ["model_subclassing_test.py"],
-    shard_count = 4,
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "topology_test",
     size = "medium",
     srcs = ["engine/topology_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no-internal-py3",
-    ],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    tags = [
+        "no-internal-py3",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "base_layer_test",
     size = "small",
     srcs = ["engine/base_layer_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "saving_test",
     size = "medium",
     srcs = ["engine/saving_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "sequential_test",
     size = "medium",
     srcs = ["engine/sequential_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "models_test",
     size = "medium",
     srcs = ["models_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],  # b/67509773
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:training",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
+    shard_count = 8,
+    tags = ["notsan"],  # b/67509773
 )
 
-py_test(
+tf_py_test(
     name = "backend_test",
     size = "medium",
     srcs = ["backend_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:util",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "keras_parameterized_test",
     size = "small",
     srcs = ["keras_parameterized_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    tags = ["notsan"],
 )
diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index 42d94e77a0585250cd234d1813e1b366f95aba94..6693243a684b1c7008361524dc40f3c7cde15952 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -2794,6 +2794,11 @@ def get_value(x):
   """
   if context.executing_eagerly():
     return x.numpy()
+  elif not getattr(x, '_in_graph_mode', True):
+    # This is a variable which was created in an eager context, but is being
+    # evaluated from a Graph.
+    with context.eager_mode():
+      return x.numpy()
   elif ops.inside_function():
     raise RuntimeError('Cannot get value inside Tensorflow graph function.')
   return x.eval(session=get_session())
diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py
index 589cd992d6f59262889f663c8ab149a0b3cd186a..b9074669eb8243a5bb7f2aaa2da3ab3d5c5ce833 100644
--- a/tensorflow/python/keras/callbacks.py
+++ b/tensorflow/python/keras/callbacks.py
@@ -123,14 +123,6 @@ def configure_callbacks(callbacks,
       'metrics': callback_metrics,
   }
   callback_list.set_params(callback_params)
-
-  if (do_validation and not model._distribution_strategy and
-      not model.run_eagerly):
-    # Need to create the eval_function before start of the first epoch
-    # because TensorBoard callback on_epoch_begin adds summary to the
-    # list of fetches of the eval_function
-    callback_model._make_eval_function()
-
   callback_list.model.stop_training = False
   return callback_list
 # pylint: enable=protected-access
@@ -1373,6 +1365,7 @@ class TensorBoard(Callback):
       self._epoch = epoch
       # pylint: disable=protected-access
       # add the histogram summary op if it should run this epoch
+      self.model._make_eval_function()
       if self.merged not in self.model._eval_function.fetches:
         self.model._eval_function.fetches.append(self.merged)
         self.model._eval_function.fetch_callbacks[
diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py
index ef469c5e4f5deb3e4f0cff7cb3deea95d0266d9b..d5ffdf789ae7b1d466d32d9c875f4f7aeca4c28b 100644
--- a/tensorflow/python/keras/callbacks_test.py
+++ b/tensorflow/python/keras/callbacks_test.py
@@ -31,7 +31,6 @@ import numpy as np
 
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python import keras
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import keras_parameterized
@@ -1397,47 +1396,6 @@ class KerasCallbacksTest(test.TestCase):
             callbacks=cbks,
             epochs=1)
 
-  @test_util.run_deprecated_v1
-  def test_fit_generator_with_callback(self):
-
-    class TestCallback(keras.callbacks.Callback):
-
-      def set_model(self, model):
-        # Check the model operations for the optimizer operations that
-        # the _make_train_function adds under a named scope for the
-        # optimizer. This ensurs the full model is populated before the
-        # set_model callback is called.
-        optimizer_name_scope = 'training/' + model.optimizer.__class__.__name__
-        graph_def = ops.get_default_graph().as_graph_def()
-        for node in graph_def.node:
-          if node.name.startswith(optimizer_name_scope):
-            return
-        raise RuntimeError('The optimizer operations are not present in the '
-                           'model graph when the Callback.set_model function '
-                           'is called')
-    np.random.seed(1337)
-
-    def generator():
-      x = np.random.randn(10, 100).astype(np.float32)
-      y = np.random.randn(10, 10).astype(np.float32)
-      while True:
-        yield x, y
-
-    with self.cached_session():
-      model = testing_utils.get_small_sequential_mlp(
-          num_hidden=10, num_classes=10, input_dim=100)
-      model.compile(
-          loss='categorical_crossentropy',
-          optimizer='sgd',
-          metrics=['accuracy'])
-      model.fit_generator(
-          generator(),
-          steps_per_epoch=2,
-          epochs=1,
-          validation_data=generator(),
-          validation_steps=2,
-          callbacks=[TestCallback()],
-          verbose=0)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/engine/base_layer_utils.py b/tensorflow/python/keras/engine/base_layer_utils.py
index 60d0cf3391f6c77bcfd4b877c8162fe0af3d70bc..f1b1f42c229a927347578d27638f587c8885da4c 100644
--- a/tensorflow/python/keras/engine/base_layer_utils.py
+++ b/tensorflow/python/keras/engine/base_layer_utils.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import init_ops_v2
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.util import nest
 
@@ -55,7 +56,6 @@ def make_variable(name,
                   shape=None,
                   dtype=dtypes.float32,
                   initializer=None,
-                  partition_info=None,
                   trainable=None,
                   caching_device=None,
                   validate_shape=True,
@@ -76,14 +76,12 @@ def make_variable(name,
   rid of this temporary solution.
 
   TODO(fchollet): remove this method when no longer needed.
-  TODO(fchollet): handle `partitioner` argument.
 
   Arguments:
     name: Variable name.
     shape: Variable shape.
     dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
     initializer: Initializer instance (callable).
-    partition_info: Not handled at this time.
     trainable: Whether the variable should be part of the layer's
       "trainable_variables" (e.g. variables, biases)
       or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
@@ -123,8 +121,9 @@ def make_variable(name,
       # Instantiate initializer if provided initializer is a type object.
       if isinstance(initializer, type(init_ops.Initializer)):
         initializer = initializer(dtype=dtype)
-      init_val = lambda: initializer(  # pylint: disable=g-long-lambda
-          shape, dtype=dtype, partition_info=partition_info)
+      elif isinstance(initializer, type(init_ops_v2.Initializer)):
+        initializer = initializer()
+      init_val = lambda: initializer(shape, dtype=dtype)
       variable_dtype = dtype.base_dtype
   if use_resource is None:
     use_resource = True
diff --git a/tensorflow/python/keras/engine/distributed_training_utils.py b/tensorflow/python/keras/engine/distributed_training_utils.py
index a4fc2cf196b4b324ab46420d93c92e07463aeb65..b6ef33f700e5c973cdd912e1307d76a3b1983552 100644
--- a/tensorflow/python/keras/engine/distributed_training_utils.py
+++ b/tensorflow/python/keras/engine/distributed_training_utils.py
@@ -34,7 +34,6 @@ from tensorflow.python.keras import callbacks
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import optimizers
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
@@ -519,100 +518,11 @@ def get_batch_dimension(iterator):
   return dims[0] if dims else None
 
 
-def get_cpu_device(distribution_strategy):
-  """Returns the CPU device of the TPU host or the default CPU device string.
-
-  Args:
-    distribution_strategy: The DistributionStrategy used to compile the model.
-
-  Returns:
-    A device string which is the TPU host's CPU device in case of
-    TPUDistributionStrategy or the default CPU device string in all other
-    cases.
-
-  Raises:
-    NotImplementedError: We currently don't support copying numpy data to
-    multiple hosts in the case of Cloud TPU pods.
-  """
-  if is_tpu_strategy(distribution_strategy):
-    if distribution_strategy.extended.num_hosts > 1:
-      raise NotImplementedError('TPUDistributionStrategy does not '
-                                'support numpy inputs when running on Cloud'
-                                'TPU pods.')
-    return distribution_strategy.extended.get_host_cpu_device(0)
-  else:
-    # For all strategies except TPUDistributionStrategy
-    # TODO(anjalisridhar): We may need to modify this when we add support for
-    # multi-worker strategy.
-    return '/CPU:0'
-
-
-def get_var_for_numpy(distribution_strategy, x):
-  if isinstance(x, list):
-    var_x = tuple([_get_var_for_numpy(distribution_strategy, single_input)
-                   for single_input in x])
-  else:
-    var_x = _get_var_for_numpy(distribution_strategy, x)
-  return var_x
-
-
-def _get_var_for_numpy(distribution_strategy, input_array):
-  """Creates a variable and assigns the value of the numpy array to it.
-
-  Args:
-    distribution_strategy: The DistributionStrategy used to compile the model.
-    input_array: The input numpy array whose value will be assigned to the
-      variable we create.
-
-  Returns:
-    The variable to which we will copy the value of the input numpy array.
-
-  """
-  with ops.device(get_cpu_device(distribution_strategy)):
-    # Create and initialize a variable on the CPU device. This is the CPU
-    # device of the host in the case of TPUDistributionStrategy.
-    input_var = variables.VariableV1(array_ops.zeros(input_array.shape,
-                                                     input_array.dtype),
-                                     trainable=False, use_resource=True)
-  K.get_session().run(input_var.initializer)
-
-  # Create a placeholder for the numpy array input slices. We copy the value
-  # of the input numpy array to the variable in slices of size 64 MB to avoid
-  # running into memory issues or RPC message limits.
-  start_placeholder = array_ops.placeholder(dtypes.int64, ())
-  end_placeholder = array_ops.placeholder(dtypes.int64, ())
-  slice_placeholder = array_ops.placeholder(input_var.dtype)
-  assign_slice_op = input_var[start_placeholder:end_placeholder].assign(
-      slice_placeholder)
-
-  # If each batch element is > 64 MB, then we copy each batch element
-  # individually. Otherwise, the slices will be < 128 MB. There might be padding
-  # which might mean that the slices are 128 MB even if the size of the
-  # tensor allocated is less than 128 MB.
-  # This formula gives slices with size:
-  # ceil(64 MB / byte size per batch element) bytes.
-  # Using ceil() guarantees we get a number >= 1.
-
-  # Calculate the size of each batch element.
-  byte_size_per_batch_element = np.prod(input_array.shape[1:]) * \
-                                input_var.dtype.size
-
-  # Calculate number of elements we want to copy per slice.
-  batch_size_per_slice = int(np.ceil((64 << 20) / byte_size_per_batch_element))
-
-  # Copy slices of the above size starting at 0, except the last slice will be
-  # smaller.
-  start = 0
-  limit = input_array.shape[0]
-  while start < limit:
-    end = min(start + batch_size_per_slice, limit)
-    K.get_session().run(assign_slice_op, feed_dict={
-        start_placeholder: start,
-        end_placeholder: end,
-        slice_placeholder: input_array[start:end]})
-    start = end
-
-  return input_var
+def list_to_tuple(maybe_list):
+  """Datasets treat lists specially, so switch them to tuples."""
+  if isinstance(maybe_list, list):
+    return tuple(maybe_list)
+  return maybe_list
 
 
 def _get_input_from_iterator(iterator, model):
@@ -671,6 +581,12 @@ def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
 
 def _custom_compile_for_predict(model):
   """Custom compile for TPU predict mode."""
+  if not model.built:
+    # Model is not compilable because it does not know its number of inputs
+    # and outputs, nor their shapes and names. We will compile after the first
+    # time the model gets called on training data.
+    return
+  model._is_compiled = True
   model.total_loss = None
   model._fit_function = None
   model._eval_function = None
diff --git a/tensorflow/python/keras/engine/input_layer.py b/tensorflow/python/keras/engine/input_layer.py
index bc2cf2fb6e10e6f80f7f56351e57ae2bc5cea726..c6dcedfce2f620b039fc8cfa7c3366d801e9c176 100644
--- a/tensorflow/python/keras/engine/input_layer.py
+++ b/tensorflow/python/keras/engine/input_layer.py
@@ -77,8 +77,9 @@ class InputLayer(base_layer.Layer):
         dtype = backend.floatx()
       else:
         dtype = backend.dtype(input_tensor)
-    elif input_tensor and input_tensor.dtype != dtype:
-      raise ValueError('`input_tensor.dtype` differs from `dtype`.')
+    elif input_tensor is not None and input_tensor.dtype != dtype:
+      raise ValueError('`input_tensor.dtype` differs from `dtype`: %s vs. %s' %
+                       (input_tensor.dtype, dtype))
     super(InputLayer, self).__init__(dtype=dtype, name=name)
     self.built = True
     self.sparse = sparse
diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index 0837ce6780c76f51fb4a9cd922faa245744712e2..8e130aef40cdb04757683b7b0e5c6d1ad63c291a 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -1382,9 +1382,10 @@ class Network(base_layer.Layer):
             % (optimizer,))
       self._checkpointable_saver.save(filepath, session=session)
       # Record this checkpoint so it's visible from tf.train.latest_checkpoint.
-      checkpoint_management.update_checkpoint_state(
+      checkpoint_management.update_checkpoint_state_internal(
           save_dir=os.path.dirname(filepath),
           model_checkpoint_path=filepath,
+          save_relative_paths=True,
           all_model_checkpoint_paths=[filepath])
 
   def load_weights(self, filepath, by_name=False):
diff --git a/tensorflow/python/keras/engine/saving.py b/tensorflow/python/keras/engine/saving.py
index 91eba0acabf86f605e111f8d1820471086eb12b5..0604a389a954216b1a7e2ac3a2c2263bf6dad2e4 100644
--- a/tensorflow/python/keras/engine/saving.py
+++ b/tensorflow/python/keras/engine/saving.py
@@ -25,6 +25,7 @@ import os
 import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
 
+from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import optimizers
 from tensorflow.python.keras.utils import conv_utils
@@ -736,9 +737,17 @@ def save_weights_to_hdf5_group(f, layers):
   f.attrs['backend'] = K.backend().encode('utf8')
   f.attrs['keras_version'] = str(keras_version).encode('utf8')
 
+  # On TPUs, modifying the graph between session.runs() triggers some expensive
+  # recompilation overhead. To avoid this, we build up the full set of tensors
+  # to save before fetching weights, thus only modifying the graph once.
+  layer_weights_dict = {}
+  for layer in layers:
+    layer_weights_dict[layer.name] = [ops.convert_to_tensor(w)
+                                      for w in layer.weights]
+
   for layer in layers:
     g = f.create_group(layer.name)
-    symbolic_weights = layer.weights
+    symbolic_weights = layer_weights_dict[layer.name]
     weight_values = K.batch_get_value(symbolic_weights)
     weight_names = []
     for i, (w, val) in enumerate(zip(symbolic_weights, weight_values)):
diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py
index dffd3c8a69ab9de772ac69b3d780f8afa61441af..36270911988508ae169467c896786f59e613a6bb 100644
--- a/tensorflow/python/keras/engine/sequential.py
+++ b/tensorflow/python/keras/engine/sequential.py
@@ -262,16 +262,17 @@ class Sequential(Model):
           with ops.name_scope(layer._name_scope()):
             layer._maybe_build(x)
           layer.built = True
+        if layer.supports_masking:
+          mask = layer.compute_mask(x, mask)
+        else:
+          mask = None
+
         if context.executing_eagerly():
           x = layer(x, **kwargs)
         elif layer.dynamic:
           x = layer._symbolic_call(x)
         else:
           x = layer.call(x, **kwargs)
-        if layer.supports_masking:
-          mask = layer.compute_mask(x, mask)
-        else:
-          mask = None
       if not context.executing_eagerly():
         x._keras_mask = mask
     return x, mask
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index be3afafd3e9c04b51b886848a95397e48ac19cea..0b1743af38d0faa8e44b636f741b1a5beaee4bae 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -19,14 +19,12 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
-import weakref
 import numpy as np
 
 from tensorflow.python import tf2
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
@@ -122,10 +120,6 @@ class Model(Network):
 
   def __init__(self, *args, **kwargs):
     super(Model, self).__init__(*args, **kwargs)
-    # Create a cache for iterator get_next op.
-    self._iterator_get_next = weakref.WeakKeyDictionary()
-    # Create a cache for dataset - uninitialized iterators
-    self._dataset_iterator_cache = weakref.WeakKeyDictionary()
     # initializing _distribution_strategy here since it is possible to call
     # predict on a model without compiling it.
     self._distribution_strategy = None
@@ -222,14 +216,14 @@ class Model(Network):
       self._distribution_strategy = distribute
       self._compile_distribution = True
     else:
-      if distribution_strategy_context.has_distribution_strategy():
+      if distribution_strategy_context.has_strategy():
         # When the user builds the model in the DS scope and cross replica
         # context we want distribution strategy to be set but when building the
         # replica copies of the models internally we should not be compiling
         # with distribution strategy and use the default compilation path.
         if distribution_strategy_context.in_cross_replica_context():
           self._distribution_strategy = (
-              distribution_strategy_context.get_distribution_strategy())
+              distribution_strategy_context.get_strategy())
 
     # Validate that arguments passed by the user to `compile` are supported by
     # DistributionStrategy.
@@ -548,6 +542,20 @@ class Model(Network):
       trainable_weights = self.trainable_weights
       self._collected_trainable_weights = trainable_weights
 
+      # Validate all variables were correctly created in distribution scope.
+      if self._distribution_strategy and not self._compile_distribution:
+        for v in self.variables:
+          if v.distribute_strategy is not self._distribution_strategy:
+            raise ValueError(
+                'Variable (%s) was not created in the distribution strategy '
+                'scope of (%s). It is most likely due to not all layers or '
+                'the model or optimizer being created outside the distribution '
+                'strategy scope. Try to make sure your code looks similar '
+                'to the following.\n'
+                'with strategy.scope():\n'
+                '  model=_create_model()\n'
+                '  model.compile(...)'% (v, self._distribution_strategy))
+
   @property
   def metrics(self):
     """Returns the model's metrics added using `compile`, `add_metric` APIs."""
@@ -1233,7 +1241,8 @@ class Model(Network):
                                 'compiled with DistributionStrategy.')
     # Validate and standardize user data.
     x, y, sample_weights = self._standardize_user_data(
-        x, y, sample_weight=sample_weight, class_weight=class_weight)
+        x, y, sample_weight=sample_weight, class_weight=class_weight,
+        extract_tensors_from_dataset=True)
 
     if self.run_eagerly:
       outputs = training_eager.train_on_batch(
@@ -1302,7 +1311,7 @@ class Model(Network):
                                 'compiled with DistributionStrategy.')
     # Validate and standardize user data.
     x, y, sample_weights = self._standardize_user_data(
-        x, y, sample_weight=sample_weight)
+        x, y, sample_weight=sample_weight, extract_tensors_from_dataset=True)
 
     if self.run_eagerly:
       outputs = training_eager.test_on_batch(
@@ -1345,7 +1354,8 @@ class Model(Network):
       raise NotImplementedError('`predict_on_batch` is not supported for '
                                 'models compiled with DistributionStrategy.')
     # Validate and standardize user data.
-    inputs, _, _ = self._standardize_user_data(x)
+    inputs, _, _ = self._standardize_user_data(
+        x, extract_tensors_from_dataset=True)
     if self.run_eagerly:
       if (isinstance(inputs, iterator_ops.EagerIterator) or
           (isinstance(inputs, dataset_ops.DatasetV2))):
@@ -1977,7 +1987,7 @@ class Model(Network):
           ' without calling `model.compile` after ?', 1)
 
   def _make_train_function_helper(self, fn_name, outputs, metric_updates=None):
-    if not hasattr(self, fn_name):
+    if not self._is_compiled:
       raise RuntimeError('You must compile your model before using it.')
     self._check_trainable_weights_consistency()
     if getattr(self, fn_name) is None:
@@ -2026,7 +2036,7 @@ class Model(Network):
         '_fit_function', [self.total_loss] + metrics_tensors)
 
   def _make_test_function_helper(self, fn_name, outputs, metric_updates=None):
-    if not hasattr(self, fn_name):
+    if not self._is_compiled:
       raise RuntimeError('You must compile your model before using it.')
     if getattr(self, fn_name) is None:
       inputs = (self._feed_inputs +
@@ -2089,13 +2099,6 @@ class Model(Network):
       self._make_predict_function()
       return self.predict_function
 
-  def _get_iterator_get_next_tensors(self, iterator):
-    get_next_op = self._iterator_get_next.get(iterator, None)
-    if get_next_op is None:
-      get_next_op = iterator.get_next()
-      self._iterator_get_next[iterator] = get_next_op
-    return get_next_op
-
   def _distribution_standardize_user_data(self,
                                           x,
                                           y=None,
@@ -2158,54 +2161,47 @@ class Model(Network):
                        'you should specify the `{steps_name}` argument.'
                        .format(steps_name=steps_name))
 
-    first_x_value = nest.flatten(x)[0]
-    if isinstance(first_x_value, np.ndarray):
-      # We need to use the drop_remainder argument to allow for a static
-      # input shape which is required for TPUs.
-      drop_remainder = self._distribution_strategy.require_static_shapes
-      if y is not None:
-        var_x = distributed_training_utils.get_var_for_numpy(
-            self._distribution_strategy, x)
-        var_y = distributed_training_utils.get_var_for_numpy(
-            self._distribution_strategy, y)
-        if sample_weight is not None:
-          var_sample_weights = distributed_training_utils.get_var_for_numpy(
-              self._distribution_strategy, sample_weight)
-
-          x = dataset_ops.Dataset.from_tensor_slices((var_x, var_y,
-                                                      var_sample_weights))
+    if ops.executing_eagerly_outside_functions():
+      session = None
+    else:
+      session = K.get_session()
+
+    with self._distribution_strategy.scope():
+      first_x_value = nest.flatten(x)[0]
+      if isinstance(first_x_value, np.ndarray):
+        x = distributed_training_utils.list_to_tuple(x)
+        if y is not None:
+          y = distributed_training_utils.list_to_tuple(y)
+          if sample_weight is not None:
+            sample_weight = distributed_training_utils.list_to_tuple(
+                sample_weight)
+            in_tuple = (x, y, sample_weight)
+          else:
+            in_tuple = (x, y)
         else:
-          x = dataset_ops.Dataset.from_tensor_slices((var_x, var_y))
+          in_tuple = x
 
-        x = dataset_ops.Dataset.from_tensor_slices((var_x, var_y))
         if shuffle:
           # 1024 is a good buffer size since it is much larger than the average
           # batch size provided by the user and provides sufficient randomness.
           # One thing to keep in mind is the memory usage based on the size of
           # each sample.
-          x = x.shuffle(1024)
-        x = x.repeat()
-        x = x.batch(batch_size, drop_remainder=drop_remainder)
-        y = None
-        sample_weight = None
+          shuffle_buffer = 1024
+        else:
+          shuffle_buffer = None
+        iterator = self._distribution_strategy.experimental_make_numpy_iterator(
+            in_tuple, batch_size, num_epochs=None, shuffle=shuffle_buffer,
+            session=session)
       else:
-        # This case is for the predict call where the dataset only contains
-        # inputs and no targets, i.e. it does not return a tuple
-        var_x = distributed_training_utils.get_var_for_numpy(
-            self._distribution_strategy, x)
-        x = dataset_ops.Dataset.from_tensor_slices(var_x)
-        x = x.batch(batch_size, drop_remainder=drop_remainder)
+        assert isinstance(x, dataset_ops.DatasetV2)
+        training_utils.validate_dataset_input(x, y, sample_weight,
+                                              validation_split)
+        iterator = self._distribution_strategy.make_dataset_iterator(x)
 
-    assert isinstance(x, dataset_ops.DatasetV2)
-
-    with self._distribution_strategy.scope():
-      iterator = self._distribution_strategy.make_dataset_iterator(x)
       init_op = iterator.initialize()
       if not context.executing_eagerly():
         K.get_session().run(init_op)
 
-    training_utils.validate_dataset_input(x, y, sample_weight,
-                                          validation_split)
     return iterator
 
   def _standardize_user_data(self,
@@ -2218,7 +2214,8 @@ class Model(Network):
                              steps_name='steps',
                              steps=None,
                              validation_split=0,
-                             shuffle=False):
+                             shuffle=False,
+                             extract_tensors_from_dataset=False):
     """Runs validation checks on input and target data passed by the user.
 
     Also standardizes the data to lists of arrays, in order.
@@ -2262,6 +2259,10 @@ class Model(Network):
       validation_split: Float between 0 and 1.
         Fraction of the training data to be used as validation data.
       shuffle: Boolean whether to shuffle the training data before each epoch.
+      extract_tensors_from_dataset: Boolean. When `x` is a dataset instance,
+        this indicates whether to extract actual tensors from the dataset or
+        instead output the dataset instance itself.
+        Set to True when calling from `train_on_batch`/etc.
 
     Returns:
       A tuple of 3: inputs (arrays or dicts, depending on whether `x` was a dict
@@ -2274,60 +2275,30 @@ class Model(Network):
       ValueError: In case of invalid user-provided data.
       RuntimeError: If the model was never compiled.
     """
-    if isinstance(x, dataset_ops.DatasetV2):
-      if context.executing_eagerly():
-        x = iter(x)
-      else:
-        if x in self._dataset_iterator_cache:
-          x = self._dataset_iterator_cache[x]
-        else:
-          iterator = dataset_ops.make_initializable_iterator(x)
-          self._dataset_iterator_cache[x] = iterator
-          x = iterator
-        K.get_session().run(x.initializer)
+    if isinstance(x, (dataset_ops.DatasetV2, dataset_ops.DatasetV1)):
+      # Graph mode dataset. We'll pass the dataset as-is (unless
+      # `extract_tensors_from_dataset` is True, in which case we extract
+      # the tensors from the dataset and we output them.
+      training_utils.validate_dataset_input(x, y, sample_weight,
+                                            validation_split)
+      is_dataset = True
+      if extract_tensors_from_dataset:
+        # We do this for `train_on_batch`/etc.
+        x, y, sample_weight = training_utils.extract_tensors_from_dataset(x)
+    elif isinstance(x, iterator_ops.Iterator):
+      # Graph mode iterator. We extract the symbolic tensors.
+      training_utils.validate_dataset_input(x, y, sample_weight,
+                                            validation_split)
+      iterator = x
+      x, y, sample_weight = training_utils.unpack_iterator_input(iterator)
+      is_dataset = True
+    else:
+      is_dataset = False
 
     # Validates `steps` argument based on x's type.
     if check_steps:
       training_utils.check_steps_argument(x, steps, steps_name)
 
-    is_x_eager_iterator = isinstance(x, iterator_ops.EagerIterator)
-    is_x_iterator = isinstance(x, iterator_ops.Iterator)
-
-    # Validate user inputs when data is given as a dataset or dataset iterator.
-    if is_x_iterator or is_x_eager_iterator:
-      training_utils.validate_dataset_input(x, y, sample_weight,
-                                            validation_split)
-
-    # For eager iterators, when we have to process multiple batches of samples,
-    # we will standardize the data when we actually loop over iterator and get
-    # the batches. For now, we just return the iterator as is.
-    if is_x_eager_iterator:
-      return x, y, sample_weight
-
-    # If input data is a dataset iterator in graph mode or if it is an eager
-    # iterator and only one batch of samples is required, we fetch the data
-    # tensors from the iterator and then standardize them.
-    if is_x_iterator:
-      try:
-        next_element = self._get_iterator_get_next_tensors(x)
-      except errors.OutOfRangeError:
-        raise RuntimeError('Your dataset iterator ran out of data; '
-                           'Make sure that your dataset can generate '
-                           'required number of samples.')
-
-      if isinstance(next_element, (list, tuple)):
-        if len(next_element) not in [2, 3]:
-          raise ValueError(
-              'Please provide model inputs as a list or tuple of 2  or 3'
-              'elements: (input, target) or (input, target, sample_weights)'
-              'Received %s' % next_element)
-        if len(next_element) == 2:
-          x, y = next_element
-        else:
-          x, y, sample_weight = next_element
-      else:
-        x = next_element
-
     # First, we build/compile the model on the fly if necessary.
     all_inputs = []
     is_build_called = False
@@ -2336,40 +2307,51 @@ class Model(Network):
     # rather than list inputs (e.g. FeatureColumn-based models).
     dict_inputs = False
     if not self.inputs:
-      # We need to use `x` to set the model inputs.
-      # We type-check that `x` and `y` are either single arrays
+      # We need to use `x_input` to set the model inputs.
+
+      # If input data is a dataset iterator in graph mode or if it is an eager
+      # iterator and only one batch of samples is required, we fetch the data
+      # tensors from the iterator and then standardize them.
+      if isinstance(x, (dataset_ops.DatasetV2, dataset_ops.DatasetV1)):
+        x_input, y_input, _ = training_utils.extract_tensors_from_dataset(x)
+      else:
+        x_input = x
+        y_input = y
+      # We type-check that `x_input` and `y_input` are either single arrays
       # or lists of arrays.
-      if isinstance(x, (list, tuple)):
+      if isinstance(x_input, (list, tuple)):
         if not all(isinstance(v, np.ndarray) or
-                   tensor_util.is_tensor(v) for v in x):
+                   tensor_util.is_tensor(v) for v in x_input):
           raise ValueError('Please provide as model inputs either a single '
                            'array or a list of arrays. You passed: x=' + str(x))
-        all_inputs += list(x)
-      elif isinstance(x, dict):
+        all_inputs += list(x_input)
+      elif isinstance(x_input, dict):
         dict_inputs = True
-        keys = sorted(x.keys())
-        all_inputs = [x[k] for k in keys]
+        keys = sorted(x_input.keys())
+        all_inputs = [x_input[k] for k in keys]
       else:
-        if not isinstance(x, np.ndarray) and not tensor_util.is_tensor(x):
+        if (not isinstance(x_input, np.ndarray) and
+            not tensor_util.is_tensor(x_input)):
           raise ValueError('Please provide as model inputs either a single '
                            'array or a list of arrays. You passed: x=' + str(x))
-        all_inputs.append(x)
+        all_inputs.append(x_input)
 
       # Build the model using the retrieved inputs (value or symbolic).
       # If values or generated from a dataset, then in symbolic-mode
       # placeholders will be created to match the value shapes.
       is_build_called = True
-      if is_x_iterator:
-        cast_inputs = nest.map_structure(lambda v: v.shape, x)
-      elif training_utils.has_tensors(x):
-        cast_inputs = training_utils.cast_if_floating_dtype(x)
+      if is_dataset:
+        cast_inputs = nest.map_structure(lambda v: v.shape, x_input)
+      elif training_utils.has_tensors(x_input):
+        cast_inputs = training_utils.cast_if_floating_dtype(x_input)
       else:
-        cast_inputs = x
+        cast_inputs = x_input
       self._set_inputs(cast_inputs)
     else:
+      y_input = y
       dict_inputs = isinstance(self.inputs, dict)
 
-    if y is not None:
+    if y_input is not None:
       if not self.optimizer:
         raise RuntimeError('You must compile a model before '
                            'training/testing. '
@@ -2377,23 +2359,24 @@ class Model(Network):
       if not self._is_compiled:
         # On-the-fly compilation of the model.
         # We need to use `y` to set the model targets.
-        if training_utils.has_tensors(y):
-          y = training_utils.cast_if_floating_dtype(y)
-        if isinstance(y, (list, tuple)):
+        if training_utils.has_tensors(y_input):
+          y_input = training_utils.cast_if_floating_dtype(y_input)
+        if isinstance(y_input, (list, tuple)):
           if not all(isinstance(v, np.ndarray) or
-                     tensor_util.is_tensor(v) for v in y):
+                     tensor_util.is_tensor(v) for v in y_input):
             raise ValueError('Please provide as model targets either a single '
                              'array or a list of arrays. '
                              'You passed: y=' + str(y))
-          all_inputs += list(y)
-        elif isinstance(y, dict):
-          raise ValueError('Please do not pass a dictionary as model targets.')
+          all_inputs += list(y_input)
+        elif isinstance(y_input, dict):
+          raise ValueError('You cannot pass a dictionary as model targets.')
         else:
-          if not isinstance(y, np.ndarray) and not tensor_util.is_tensor(y):
+          if (not isinstance(y_input, np.ndarray) and
+              not tensor_util.is_tensor(y_input)):
             raise ValueError('Please provide as model targets either a single '
                              'array or a list of arrays. '
                              'You passed: y=' + str(y))
-          all_inputs.append(y)
+          all_inputs.append(y_input)
 
         # Typecheck that all inputs are *either* value *or* symbolic.
         # TODO(fchollet): this check could be removed in Eager mode?
@@ -2403,13 +2386,13 @@ class Model(Network):
                              'TensorFlow tensors. '
                              'You passed: x=' + str(x) + '; y=' + str(y))
 
-        if self.run_eagerly or is_x_iterator:
+        if is_dataset or context.executing_eagerly():
           target_tensors = None
         else:
           # Handle target tensors if any passed.
-          if not isinstance(y, (list, tuple)):
-            y = [y]
-          target_tensors = [v for v in y if _is_symbolic_tensor(v)]
+          if not isinstance(y_input, (list, tuple)):
+            y_input = [y_input]
+          target_tensors = [v for v in y_input if _is_symbolic_tensor(v)]
         is_compile_called = True
         self.compile(
             optimizer=self.optimizer,
@@ -2427,7 +2410,7 @@ class Model(Network):
     # Note: in this case, `any` and `all` are equivalent since we disallow
     # mixed symbolic/value inputs.
     if (not self.run_eagerly and is_build_called and is_compile_called and
-        not is_x_iterator and any(_is_symbolic_tensor(v) for v in all_inputs)):
+        not is_dataset  and any(_is_symbolic_tensor(v) for v in all_inputs)):
       return [], [], []
 
     # What follows is input validation and standardization to list format,
@@ -2449,12 +2432,14 @@ class Model(Network):
       feed_input_shapes = self._feed_input_shapes
 
     # Standardize the inputs.
-    x = training_utils.standardize_input_data(
-        x,
-        feed_input_names,
-        feed_input_shapes,
-        check_batch_axis=False,  # Don't enforce the batch size.
-        exception_prefix='input')
+    if not isinstance(x, (dataset_ops.DatasetV2, dataset_ops.DatasetV1)):
+      # TODO(fchollet): run static checks with dataset output shape(s).
+      x = training_utils.standardize_input_data(
+          x,
+          feed_input_names,
+          feed_input_shapes,
+          check_batch_axis=False,  # Don't enforce the batch size.
+          exception_prefix='input')
 
     if y is not None:
       if not self._is_graph_network:
@@ -2528,7 +2513,8 @@ class Model(Network):
                          str(x[0].shape[0]) + ' samples')
 
     # If dictionary inputs were provided, we return a dictionary as well.
-    if dict_inputs:
+    if dict_inputs and not isinstance(x, (dataset_ops.DatasetV2,
+                                          dataset_ops.DatasetV1)):
       x = dict(zip(feed_input_names, x))
     return x, y, sample_weights
 
diff --git a/tensorflow/python/keras/engine/training_arrays.py b/tensorflow/python/keras/engine/training_arrays.py
index 97025a9e18d3eddd591c59d7bd176ab88a68c6a7..ab7d455bfa23e99f646a837e0b3df8c026cdd532 100644
--- a/tensorflow/python/keras/engine/training_arrays.py
+++ b/tensorflow/python/keras/engine/training_arrays.py
@@ -23,6 +23,7 @@ import functools
 
 import numpy as np
 
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import errors
 from tensorflow.python.keras import backend as K
@@ -146,15 +147,15 @@ def model_iteration(model,
 
   Arguments:
       model: Keras Model instance.
-      inputs: Either a list of arrays or a dictionary.
-      targets: List of target arrays.
+      inputs: Either a list or dictionary of arrays, or a dataset instance.
+      targets: List/dictionary of input arrays.
       sample_weights: Optional list of sample weight arrays.
       batch_size: Integer batch size or None if unknown.
       epochs: Number of times to iterate over the data
       verbose: Verbosity mode, 0, 1 or 2
       callbacks: List of callbacks to be called during training
-      val_inputs: List of input arrays.
-      val_targets: List of target arrays.
+      val_inputs: Either a list or dictionary of arrays, or a dataset instance.
+      val_targets: List/dictionary of target arrays.
       val_sample_weights: Optional list of sample weight arrays.
       shuffle: Whether to shuffle the data at the beginning of each epoch
         concatenation of list the display names of the outputs of `f` and the
@@ -186,6 +187,20 @@ def model_iteration(model,
   if 'steps' in kwargs:
     steps_per_epoch = kwargs['steps']
 
+  # In case we are passed datasets, we extract symbolic tensors from them.
+  if isinstance(inputs, (dataset_ops.DatasetV2, dataset_ops.DatasetV1)):
+    inputs, targets, sample_weights = model._standardize_user_data(
+        inputs,
+        steps_name='steps_per_epoch',
+        steps=steps_per_epoch,
+        extract_tensors_from_dataset=True)
+  if isinstance(val_inputs, (dataset_ops.DatasetV2, dataset_ops.DatasetV1)):
+    val_inputs, val_targets, val_sample_weights = model._standardize_user_data(
+        val_inputs,
+        steps_name='validation_steps',
+        steps=validation_steps,
+        extract_tensors_from_dataset=True)
+
   _validate_arguments(steps_per_epoch, validation_steps, kwargs)
   if mode == ModeKeys.TRAIN:
     _print_train_info(inputs, val_inputs, steps_per_epoch, verbose)
@@ -200,6 +215,10 @@ def model_iteration(model,
   use_steps = steps_per_epoch is not None
   do_validation = val_inputs is not None
 
+  # Convert Eager Tensors to NumPy arrays to support batching/shuffling.
+  inputs, targets, sample_weights = training_utils. \
+      convert_eager_tensors_to_numpy((inputs, targets, sample_weights))
+
   # Prepare input data.
   ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode)
   num_samples_or_steps = _get_num_samples_or_steps(ins, batch_size,
diff --git a/tensorflow/python/keras/engine/training_dataset_test.py b/tensorflow/python/keras/engine/training_dataset_test.py
index 646ce319092ae5873565cd4f9b0e7772a9ca4ea3..40d42c13f07f45cb0e598575939423b37843b4b3 100644
--- a/tensorflow/python/keras/engine/training_dataset_test.py
+++ b/tensorflow/python/keras/engine/training_dataset_test.py
@@ -25,7 +25,6 @@ import numpy as np
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
@@ -99,29 +98,6 @@ class TestTrainingWithDatasetIterators(keras_parameterized.TestCase):
                                  'the `steps` argument'):
       model.predict(iterator, verbose=0)
 
-  @keras_parameterized.run_with_all_model_types
-  @keras_parameterized.run_all_keras_modes
-  def test_get_next_op_created_once(self):
-    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae']
-    model.compile(optimizer, loss, metrics=metrics,
-                  run_eagerly=testing_utils.should_run_eagerly())
-
-    inputs = np.zeros((10, 3), np.float32)
-    targets = np.zeros((10, 4), np.float32)
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-    iterator = dataset_ops.make_one_shot_iterator(dataset)
-
-    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
-    # Finalize graph to make sure we are not appending another iterator
-    # get_next op in the graph.
-    ops.get_default_graph().finalize()
-    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
-
   @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_iterators_running_out_of_data(self):
@@ -172,9 +148,6 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
     # Call fit with validation data
     model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
               validation_data=dataset, validation_steps=2)
-    # Finalize the graph to make sure new ops aren't added when calling on the
-    # same dataset
-    ops.get_default_graph().finalize()
     model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
               validation_data=dataset, validation_steps=2)
 
@@ -292,6 +265,34 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
 
       model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
 
+  @keras_parameterized.run_all_keras_modes
+  def test_dataset_fit_correctness(self):
+
+    class SumLayer(keras.layers.Layer):
+
+      def build(self, _):
+        self.w = self.add_weight('w', ())
+
+      def call(self, inputs):
+        return keras.backend.sum(inputs) + self.w * 0
+
+    model = keras.Sequential([SumLayer(input_shape=(2,))])
+    model.compile(RMSPropOptimizer(learning_rate=0.001),
+                  loss='mae',
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((40, 2), dtype=np.float32)
+    inputs[10:20, :] = 2
+    inputs[20:30, :] = 1
+    inputs[30:, :] = 4
+    targets = np.zeros((40, 1), dtype=np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.batch(10)
+    history = model.fit(dataset,
+                        epochs=2, steps_per_epoch=2, verbose=1, shuffle=False)
+    self.assertListEqual(history.history['loss'],
+                         [inputs[:20].sum() / 2, inputs[20:].sum() / 2])
+
   @tf_test_util.run_deprecated_v1
   def test_dataset_input_shape_validation(self):
     with self.cached_session():
diff --git a/tensorflow/python/keras/engine/training_eager_test.py b/tensorflow/python/keras/engine/training_eager_test.py
index 27eaea23ba09d1405ca16f3beaa2f4c4f4a18661..a6e2c24ec275c75f2641912b7519373ff87f3a97 100644
--- a/tensorflow/python/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/engine/training_eager_test.py
@@ -28,13 +28,20 @@ from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.optimizer_v2 import rmsprop
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
 class TrainingTest(keras_parameterized.TestCase):
 
   @keras_parameterized.run_with_all_model_types(exclude_models='sequential')
+  @keras_parameterized.run_all_keras_modes
   def test_model_methods_with_eager_tensors_multi_io(self):
+    if not context.executing_eagerly():
+      # Only test V2 Function and V2 Eager modes, as V1 Graph mode with
+      # symbolic tensors has different requirements.
+      return
+
     input_a = keras.layers.Input(shape=(3,), name='input_a')
     input_b = keras.layers.Input(shape=(3,), name='input_b')
 
@@ -53,13 +60,13 @@ class TrainingTest(keras_parameterized.TestCase):
         loss,
         metrics=metrics,
         loss_weights=loss_weights,
-        run_eagerly=True,
+        run_eagerly=testing_utils.should_run_eagerly(),
         sample_weight_mode=None)
 
-    input_a = keras.backend.zeros(shape=(10, 3))
-    input_b = keras.backend.zeros(shape=(10, 3))
-    target_a = keras.backend.zeros(shape=(10, 4))
-    target_b = keras.backend.zeros(shape=(10, 4))
+    input_a = array_ops.zeros(shape=(10, 3))
+    input_b = array_ops.zeros(shape=(10, 3))
+    target_a = array_ops.zeros(shape=(10, 4))
+    target_b = array_ops.zeros(shape=(10, 4))
 
     model.fit(
         [input_a, input_b], [target_a, target_b],
@@ -107,16 +114,26 @@ class TrainingTest(keras_parameterized.TestCase):
     model.test_on_batch([input_a, input_b], [target_a, target_b])
 
   @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
   def test_model_methods_with_eager_tensors_single_io(self):
+    if not context.executing_eagerly():
+      # Only test V2 Function and V2 Eager modes, as V1 Graph mode with
+      # symbolic tensors has different requirements.
+      return
+
     model = testing_utils.get_small_mlp(10, 4, 3)
 
     optimizer = rmsprop.RMSprop(learning_rate=0.001)
     loss = 'mse'
     metrics = ['mae', metrics_module.CategoricalAccuracy()]
-    model.compile(optimizer, loss, metrics=metrics, run_eagerly=True)
+    model.compile(
+        optimizer,
+        loss,
+        metrics=metrics,
+        run_eagerly=testing_utils.should_run_eagerly())
 
-    inputs = keras.backend.zeros(shape=(10, 3))
-    targets = keras.backend.zeros(shape=(10, 4))
+    inputs = array_ops.zeros(shape=(10, 3))
+    targets = array_ops.zeros(shape=(10, 4))
 
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=0)
     model.fit(inputs, targets, epochs=1, batch_size=3, verbose=0, shuffle=False)
@@ -134,8 +151,8 @@ class TrainingTest(keras_parameterized.TestCase):
                   loss='mse',
                   run_eagerly=True)
 
-    x = keras.backend.zeros(shape=(10, 3))
-    y = keras.backend.zeros(shape=(10, 4))
+    x = array_ops.zeros(shape=(10, 3))
+    y = array_ops.zeros(shape=(10, 4))
     dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat(10).batch(5)
     iterator = dataset_ops.make_one_shot_iterator(dataset)
     validation_dataset = dataset_ops.Dataset.from_tensor_slices(
@@ -146,7 +163,7 @@ class TrainingTest(keras_parameterized.TestCase):
         ValueError, r'specify .* `steps_per_epoch`'):
       model.fit(iterator, epochs=1, verbose=0)
     if not context.executing_eagerly():
-      # In eager execution, `keras.backend.zeros` returns value tensors
+      # In eager execution, `array_ops.zeros` returns value tensors
       # which can be used for validation without a `validation_steps` argument.
       with self.assertRaisesRegexp(
           ValueError, r'provide either `batch_size` or `validation_steps`'):
diff --git a/tensorflow/python/keras/engine/training_generator.py b/tensorflow/python/keras/engine/training_generator.py
index d1699b2827dd09f84974e7dabeb126e7d5b51e27..07de93425928a61cc7bd98302aa502fd883bbdf5 100644
--- a/tensorflow/python/keras/engine/training_generator.py
+++ b/tensorflow/python/keras/engine/training_generator.py
@@ -430,12 +430,8 @@ def _make_enqueued_generator(generator,
 def _make_execution_function(model, mode, class_weight=None):
   """Makes function to run one step of model execution."""
   if mode == ModeKeys.TRAIN:
-    if not context.executing_eagerly():
-      model._make_fit_function()
     f = functools.partial(model.train_on_batch, class_weight=class_weight)
   elif mode == ModeKeys.TEST:
-    if not context.executing_eagerly():
-      model._make_eval_function()
     f = model.test_on_batch
   else:
     # Match signature of other modes to allow
diff --git a/tensorflow/python/keras/engine/training_generator_test.py b/tensorflow/python/keras/engine/training_generator_test.py
index 90c45dfcb7fdae23ffba5c0a8e72404f3b9350dd..6b754c18b3d45a66fd704a64e01b425d854d3329 100644
--- a/tensorflow/python/keras/engine/training_generator_test.py
+++ b/tensorflow/python/keras/engine/training_generator_test.py
@@ -66,8 +66,7 @@ class TestGeneratorMethods(keras_parameterized.TestCase):
   @unittest.skipIf(
       os.name == 'nt',
       'use_multiprocessing=True does not work on windows properly.')
-  # TODO(b/120940700): Bug with subclassed model inputs.
-  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_fit_generator_method(self):
     model = testing_utils.get_small_mlp(
@@ -107,8 +106,7 @@ class TestGeneratorMethods(keras_parameterized.TestCase):
   @unittest.skipIf(
       os.name == 'nt',
       'use_multiprocessing=True does not work on windows properly.')
-  # TODO(b/120940700): Bug with subclassed model inputs.
-  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_evaluate_generator_method(self):
     model = testing_utils.get_small_mlp(
@@ -173,8 +171,7 @@ class TestGeneratorMethods(keras_parameterized.TestCase):
                             max_queue_size=10,
                             workers=0)
 
-  # TODO(b/120940700): Bug with subclassed model inputs.
-  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_generator_methods_with_sample_weights(self):
     model = testing_utils.get_small_mlp(
@@ -208,8 +205,7 @@ class TestGeneratorMethods(keras_parameterized.TestCase):
                              max_queue_size=10,
                              use_multiprocessing=False)
 
-  # TODO(b/120940700): Bug with subclassed model inputs.
-  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_generator_methods_invalid_use_case(self):
 
@@ -249,8 +245,7 @@ class TestGeneratorMethods(keras_parameterized.TestCase):
                                max_queue_size=10,
                                use_multiprocessing=False)
 
-  # TODO(b/120940700): Bug with subclassed model inputs.
-  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_generator_input_to_fit_eval_predict(self):
     val_data = np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
@@ -275,8 +270,7 @@ class TestGeneratorMethods(keras_parameterized.TestCase):
 
 class TestGeneratorMethodsWithSequences(keras_parameterized.TestCase):
 
-  # TODO(b/120940700): Bug with subclassed model inputs.
-  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_training_with_sequences(self):
 
@@ -307,8 +301,7 @@ class TestGeneratorMethodsWithSequences(keras_parameterized.TestCase):
                         workers=0,
                         use_multiprocessing=False)
 
-  # TODO(b/120940700): Bug with subclassed model inputs.
-  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_sequence_input_to_fit_eval_predict(self):
     val_data = np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py
index d07e3cc4f742335ac8c6e753531c4da7e5c447f2..949f1e400d9e9dfbad5cb9d0873a71318bd2f0ee 100644
--- a/tensorflow/python/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/engine/training_utils.py
@@ -31,6 +31,7 @@ from tensorflow.python.data.ops import readers
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_spec
@@ -1081,10 +1082,10 @@ def is_feature_layer(layer):
 
 
 def is_eager_dataset_or_iterator(data):
-  if context.executing_eagerly():
-    if isinstance(data, (dataset_ops.DatasetV2, iterator_ops.EagerIterator)):
-      return True
-  return False
+  return context.executing_eagerly() and isinstance(
+      data, (dataset_ops.DatasetV1,
+             dataset_ops.DatasetV2,
+             iterator_ops.EagerIterator))
 
 
 # pylint: disable=protected-access
@@ -1199,6 +1200,64 @@ def assert_not_shuffled(dataset):
     raise ValueError('Could not assert that dataset is not shuffled.')
 
 
+def is_dataset_or_iterator(data):
+  return isinstance(data, (dataset_ops.DatasetV1,
+                           dataset_ops.DatasetV2,
+                           iterator_ops.EagerIterator,
+                           iterator_ops.Iterator))
+
+
+def extract_tensors_from_dataset(dataset):
+  """Extract a tuple of tensors `inputs, targets, sample_weight` from a dataset.
+
+  Works only for graph mode.
+
+  Arguments:
+    dataset: Dataset instance.
+
+  Returns:
+    Tuple of tensors `x, y, weights`. `y` and `weights` entry may be None.
+  """
+  iterator = dataset_ops.make_initializable_iterator(dataset)
+  K.get_session().run(iterator.initializer)
+  inputs, targets, sample_weight = unpack_iterator_input(iterator)
+  return inputs, targets, sample_weight
+
+
+def unpack_iterator_input(iterator):
+  """Convert a dataset iterator to a tuple of tensors `x, y, sample_weights`.
+
+  Arguments:
+    iterator: Instance of a dataset iterator.
+
+  Returns:
+    Tuple of tensors `x, y, weights`. `y` and `weights` entry may be None.
+  """
+  try:
+    next_element = iterator.get_next()
+  except errors.OutOfRangeError:
+    raise RuntimeError('Your dataset iterator ran out of data; '
+                       'Make sure that your dataset can generate '
+                       'required number of samples.')
+
+  if isinstance(next_element, (list, tuple)):
+    if len(next_element) not in [2, 3]:
+      raise ValueError(
+          'Please provide model inputs as a list or tuple of 2 or 3 '
+          'elements: (input, target) or (input, target, sample_weights) '
+          'Received %s' % next_element)
+    if len(next_element) == 2:
+      x, y = next_element
+      weights = None
+    else:
+      x, y, weights = next_element
+  else:
+    x = next_element
+    y = None
+    weights = None
+  return x, y, weights
+
+
 class ModelInputs(object):
   """Encapsulates model inputs.
 
@@ -1402,3 +1461,22 @@ def set_run_eagerly_for_dict_structure(model, x):
       if isinstance(item, dict):
         model.run_eagerly = True
         return
+
+
+def convert_eager_tensors_to_numpy(structure):
+  """Convert every EagerTensor in `structure` to NumPy.
+
+  Arguments:
+    structure: An arbitrary structure of elements to be converted to NumPy
+      arrays.
+
+  Returns:
+    An identical structure with EagerTensors converted to NumPy arrays.
+  """
+
+  def _convert(element):
+    if isinstance(element, ops.EagerTensor):
+      return element.numpy()
+    return element
+
+  return nest.map_structure(_convert, structure)
diff --git a/tensorflow/python/keras/layers/advanced_activations.py b/tensorflow/python/keras/layers/advanced_activations.py
index be1039a2ac9510e9acbc7472b584f104a8625033..5095287430735b4d370b0545c3971da14a4c0b6d 100644
--- a/tensorflow/python/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/layers/advanced_activations.py
@@ -121,11 +121,9 @@ class PReLU(Layer):
   @tf_utils.shape_type_conversion
   def build(self, input_shape):
     param_shape = list(input_shape[1:])
-    self.param_broadcast = [False] * len(param_shape)
     if self.shared_axes is not None:
       for i in self.shared_axes:
         param_shape[i - 1] = 1
-        self.param_broadcast[i - 1] = True
     self.alpha = self.add_weight(
         shape=param_shape,
         name='alpha',
@@ -143,12 +141,7 @@ class PReLU(Layer):
 
   def call(self, inputs, mask=None):
     pos = K.relu(inputs)
-    if K.backend() == 'theano':
-      neg = (
-          K.pattern_broadcast(self.alpha, self.param_broadcast) *
-          (inputs - math_ops.abs(inputs)) * 0.5)
-    else:
-      neg = -self.alpha * K.relu(-inputs)
+    neg = -self.alpha * K.relu(-inputs)
     return pos + neg
 
   def get_config(self):
diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py
index 7251a67191f07c4198728b87db1192aa0e6cc7d9..30b919cc0a9038cf0eeb10a240105fbabd591efa 100644
--- a/tensorflow/python/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/layers/convolutional.py
@@ -180,12 +180,14 @@ class Conv(Layer):
       op_padding = 'valid'
     else:
       op_padding = self.padding
+    if not isinstance(op_padding, (list, tuple)):
+      op_padding = op_padding.upper()
     self._convolution_op = nn_ops.Convolution(
         input_shape,
         filter_shape=self.kernel.get_shape(),
         dilation_rate=self.dilation_rate,
         strides=self.strides,
-        padding=op_padding.upper(),
+        padding=op_padding,
         data_format=conv_utils.convert_data_format(self.data_format,
                                                    self.rank + 2))
     self.built = True
@@ -199,21 +201,8 @@ class Conv(Layer):
           # nn.bias_add does not accept a 1D input tensor.
           bias = array_ops.reshape(self.bias, (1, self.filters, 1))
           outputs += bias
-        if self.rank == 2:
+        else:
           outputs = nn.bias_add(outputs, self.bias, data_format='NCHW')
-        if self.rank == 3:
-          # As of Mar 2017, direct addition is significantly slower than
-          # bias_add when computing gradients. To use bias_add, we collapse Z
-          # and Y into a single dimension to obtain a 4D input tensor.
-          outputs_shape = outputs.shape.as_list()
-          if outputs_shape[0] is None:
-            outputs_shape[0] = -1
-          outputs_4d = array_ops.reshape(outputs,
-                                         [outputs_shape[0], outputs_shape[1],
-                                          outputs_shape[2] * outputs_shape[3],
-                                          outputs_shape[4]])
-          outputs_4d = nn.bias_add(outputs_4d, self.bias, data_format='NCHW')
-          outputs = array_ops.reshape(outputs_4d, outputs_shape)
       else:
         outputs = nn.bias_add(outputs, self.bias, data_format='NHWC')
 
@@ -1127,24 +1116,10 @@ class Conv3DTranspose(Conv3D):
       outputs.set_shape(out_shape)
 
     if self.use_bias:
-      outputs_shape = outputs.shape.as_list()
-      if outputs_shape[0] is None:
-        outputs_shape[0] = -1
-      if self.data_format == 'channels_first':
-        outputs_4d = array_ops.reshape(outputs, [
-            outputs_shape[0], outputs_shape[1],
-            outputs_shape[2] * outputs_shape[3], outputs_shape[4]
-        ])
-      else:
-        outputs_4d = array_ops.reshape(outputs, [
-            outputs_shape[0], outputs_shape[1] * outputs_shape[2],
-            outputs_shape[3], outputs_shape[4]
-        ])
-      outputs_4d = nn.bias_add(
-          outputs_4d,
+      outputs = nn.bias_add(
+          outputs,
           self.bias,
           data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
-      outputs = array_ops.reshape(outputs_4d, outputs_shape)
 
     if self.activation is not None:
       return self.activation(outputs)
diff --git a/tensorflow/python/keras/layers/convolutional_test.py b/tensorflow/python/keras/layers/convolutional_test.py
index 81af06b4eca3a962d95b59e73dc3148d0312c733..eedf66ff675019674aa769d6edba8366661fb2c7 100644
--- a/tensorflow/python/keras/layers/convolutional_test.py
+++ b/tensorflow/python/keras/layers/convolutional_test.py
@@ -335,6 +335,30 @@ class Conv3DTransposeTest(keras_parameterized.TestCase):
       self.assertEqual(layer.kernel.constraint, k_constraint)
       self.assertEqual(layer.bias.constraint, b_constraint)
 
+  def test_conv3dtranspose_dynamic_shape(self):
+    input_data = np.random.random((1, 3, 3, 3, 3)).astype(np.float32)
+    with self.session(use_gpu=True):
+      # Won't raise error here.
+      testing_utils.layer_test(
+          keras.layers.Conv3DTranspose,
+          kwargs={
+              'data_format': 'channels_last',
+              'filters': 3,
+              'kernel_size': 3
+          },
+          input_shape=(None, None, None, None, 3),
+          input_data=input_data)
+      if test.is_gpu_available(cuda_only=True):
+        testing_utils.layer_test(
+            keras.layers.Conv3DTranspose,
+            kwargs={
+                'data_format': 'channels_first',
+                'filters': 3,
+                'kernel_size': 3
+            },
+            input_shape=(None, 3, None, None, None),
+            input_data=input_data)
+
 
 @keras_parameterized.run_all_keras_modes
 class SeparableConv1DTest(keras_parameterized.TestCase):
@@ -556,6 +580,30 @@ class Conv3DTest(keras_parameterized.TestCase):
       self.assertEqual(layer.kernel.constraint, k_constraint)
       self.assertEqual(layer.bias.constraint, b_constraint)
 
+  def test_conv3d_dynamic_shape(self):
+    input_data = np.random.random((1, 3, 3, 3, 3)).astype(np.float32)
+    with self.session(use_gpu=True):
+      # Won't raise error here.
+      testing_utils.layer_test(
+          keras.layers.Conv3D,
+          kwargs={
+              'data_format': 'channels_last',
+              'filters': 3,
+              'kernel_size': 3
+          },
+          input_shape=(None, None, None, None, 3),
+          input_data=input_data)
+      if test.is_gpu_available(cuda_only=True):
+        testing_utils.layer_test(
+            keras.layers.Conv3D,
+            kwargs={
+                'data_format': 'channels_first',
+                'filters': 3,
+                'kernel_size': 3
+            },
+            input_shape=(None, 3, None, None, None),
+            input_data=input_data)
+
 
 @keras_parameterized.run_all_keras_modes
 class ZeroPaddingTest(keras_parameterized.TestCase):
diff --git a/tensorflow/python/keras/layers/gru_test.py b/tensorflow/python/keras/layers/gru_test.py
index 52a7944d60fd8ab9c36d30c1bf35be4fa5c3051f..e2b65de661643bf93860c4cdaa04da99213d048d 100644
--- a/tensorflow/python/keras/layers/gru_test.py
+++ b/tensorflow/python/keras/layers/gru_test.py
@@ -117,63 +117,6 @@ class GRULayerTest(keras_parameterized.TestCase):
                   run_eagerly=testing_utils.should_run_eagerly())
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
-
-@tf_test_util.run_all_in_graph_and_eager_modes
-class GRULayerGenericTest(test.TestCase):
-
-  def test_constraints_GRU(self):
-    embedding_dim = 4
-    layer_class = keras.layers.GRU
-    k_constraint = keras.constraints.max_norm(0.01)
-    r_constraint = keras.constraints.max_norm(0.01)
-    b_constraint = keras.constraints.max_norm(0.01)
-    layer = layer_class(
-        5,
-        return_sequences=False,
-        weights=None,
-        input_shape=(None, embedding_dim),
-        kernel_constraint=k_constraint,
-        recurrent_constraint=r_constraint,
-        bias_constraint=b_constraint)
-    layer.build((None, None, embedding_dim))
-    self.assertEqual(layer.cell.kernel.constraint, k_constraint)
-    self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
-    self.assertEqual(layer.cell.bias.constraint, b_constraint)
-
-  def test_from_config_GRU(self):
-    layer_class = keras.layers.GRU
-    for stateful in (False, True):
-      l1 = layer_class(units=1, stateful=stateful)
-      l2 = layer_class.from_config(l1.get_config())
-      assert l1.get_config() == l2.get_config()
-
-  def test_regularizers_GRU(self):
-    embedding_dim = 4
-    layer_class = keras.layers.GRU
-    layer = layer_class(
-        5,
-        return_sequences=False,
-        weights=None,
-        input_shape=(None, embedding_dim),
-        kernel_regularizer=keras.regularizers.l1(0.01),
-        recurrent_regularizer=keras.regularizers.l1(0.01),
-        bias_regularizer='l2',
-        activity_regularizer='l1')
-    layer.build((None, None, 2))
-    self.assertEqual(len(layer.losses), 3)
-
-    x = keras.backend.variable(np.ones((2, 3, 2)))
-    layer(x)
-    if context.executing_eagerly():
-      self.assertEqual(len(layer.losses), 4)
-    else:
-      self.assertEqual(len(layer.get_losses_for(x)), 1)
-
-
-class GRULayerV1OnlyTest(test.TestCase):
-
-  @tf_test_util.run_v1_only('b/120941292')
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_statefulness_GRU(self):
     num_samples = 2
     timesteps = 3
@@ -192,7 +135,8 @@ class GRULayerV1OnlyTest(test.TestCase):
     layer = layer_class(
         units, return_sequences=False, stateful=True, weights=None)
     model.add(layer)
-    model.compile(optimizer='sgd', loss='mse')
+    model.compile(optimizer='sgd', loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     out1 = model.predict(np.ones((num_samples, timesteps)))
     self.assertEqual(out1.shape, (num_samples, units))
 
@@ -237,5 +181,57 @@ class GRULayerV1OnlyTest(test.TestCase):
     np.testing.assert_allclose(out7, out6, atol=1e-5)
 
 
+@tf_test_util.run_all_in_graph_and_eager_modes
+class GRULayerGenericTest(test.TestCase):
+
+  def test_constraints_GRU(self):
+    embedding_dim = 4
+    layer_class = keras.layers.GRU
+    k_constraint = keras.constraints.max_norm(0.01)
+    r_constraint = keras.constraints.max_norm(0.01)
+    b_constraint = keras.constraints.max_norm(0.01)
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_constraint=k_constraint,
+        recurrent_constraint=r_constraint,
+        bias_constraint=b_constraint)
+    layer.build((None, None, embedding_dim))
+    self.assertEqual(layer.cell.kernel.constraint, k_constraint)
+    self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
+    self.assertEqual(layer.cell.bias.constraint, b_constraint)
+
+  def test_from_config_GRU(self):
+    layer_class = keras.layers.GRU
+    for stateful in (False, True):
+      l1 = layer_class(units=1, stateful=stateful)
+      l2 = layer_class.from_config(l1.get_config())
+      assert l1.get_config() == l2.get_config()
+
+  def test_regularizers_GRU(self):
+    embedding_dim = 4
+    layer_class = keras.layers.GRU
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_regularizer=keras.regularizers.l1(0.01),
+        recurrent_regularizer=keras.regularizers.l1(0.01),
+        bias_regularizer='l2',
+        activity_regularizer='l1')
+    layer.build((None, None, 2))
+    self.assertEqual(len(layer.losses), 3)
+
+    x = keras.backend.variable(np.ones((2, 3, 2)))
+    layer(x)
+    if context.executing_eagerly():
+      self.assertEqual(len(layer.losses), 4)
+    else:
+      self.assertEqual(len(layer.get_losses_for(x)), 1)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/lstm_test.py b/tensorflow/python/keras/layers/lstm_test.py
index e8c55438c96d47a788fc14d379f224a671ad95c9..38c0177e390cacee3f725ce3ae496f09b2f51c08 100644
--- a/tensorflow/python/keras/layers/lstm_test.py
+++ b/tensorflow/python/keras/layers/lstm_test.py
@@ -23,7 +23,6 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
-from tensorflow.python.framework import test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
@@ -340,11 +339,6 @@ class LSTMLayerTest(keras_parameterized.TestCase):
     else:
       self.assertEqual(len(layer.get_losses_for(x)), 1)
 
-
-class LSTMLayerV1OnlyTest(test.TestCase):
-
-  @test_util.run_v1_only('b/120941292')
-  @test_util.run_in_graph_and_eager_modes
   def test_statefulness_LSTM(self):
     num_samples = 2
     timesteps = 3
@@ -363,7 +357,7 @@ class LSTMLayerV1OnlyTest(test.TestCase):
         units, return_sequences=False, stateful=True, weights=None)
     model.add(layer)
     model.compile(optimizer=gradient_descent.GradientDescentOptimizer(0.01),
-                  loss='mse')
+                  loss='mse', run_eagerly=testing_utils.should_run_eagerly())
     out1 = model.predict(np.ones((num_samples, timesteps)))
     self.assertEqual(out1.shape, (num_samples, units))
 
diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py
index 2e68aedbed494cfa82972e7fa58e06510e27dc87..c174c8ddd67cbd092f8b58932a7299c86ceb94f9 100644
--- a/tensorflow/python/keras/layers/normalization.py
+++ b/tensorflow/python/keras/layers/normalization.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import contextlib
-
 from tensorflow.python import tf2
 from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
@@ -34,7 +32,6 @@ from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
@@ -42,6 +39,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import keras_export
+from tensorflow.python.util.tf_export import tf_export
 
 
 @keras_export('keras.layers.BatchNormalization', v1=[])
@@ -419,20 +417,12 @@ class BatchNormalizationV2(Layer):
       # since TPUStrategy does not implement replica local variables.
       # Remove this hack once we support TPULocalVariables.
       is_tpu_strategy = False
-      if distribution_strategy_context.has_distribution_strategy():
-        distribute = distribution_strategy_context.get_distribution_strategy()
+      if distribution_strategy_context.has_strategy():
+        distribute = distribution_strategy_context.get_strategy()
         if distribute.__class__.__name__ == 'TPUStrategy':
           is_tpu_strategy = True
 
-      # TODO(apassos,srbs,skyewm): the colocation constraints here are disabled
-      # because of a bug which leads cond_v2/while_v2 to skip rewriting them
-      # creating conflicts.
-      if (control_flow_util.EnableControlFlowV2(ops.get_default_graph()) or
-          is_tpu_strategy):
-        cm = contextlib.contextmanager(lambda: (yield))()
-      else:
-        cm = ops.colocate_with(variable)
-      with cm:
+      with ops.colocate_with(variable):
         decay = ops.convert_to_tensor(1.0 - momentum, name='decay')
         if decay.dtype != variable.dtype.base_dtype:
           decay = math_ops.cast(decay, variable.dtype.base_dtype)
@@ -484,7 +474,7 @@ class BatchNormalizationV2(Layer):
       momentum = ops.convert_to_tensor(self.momentum)
     if training_value or training_value is None:
       if distribution_strategy_context.in_cross_replica_context():
-        strategy = distribution_strategy_context.get_distribution_strategy()
+        strategy = distribution_strategy_context.get_strategy()
         mean_update = strategy.extended.update(
             self.moving_mean, self._assign_moving_average,
             (mean, self.momentum))
@@ -676,7 +666,8 @@ class BatchNormalizationV2(Layer):
         scale, offset = _compose_transforms(r, d, scale, offset)
 
       if distribution_strategy_context.in_cross_replica_context():
-        strategy = distribution_strategy_context.get_distribution_strategy()
+        strategy = distribution_strategy_context.get_strategy()
+
         def _do_update(var, value):
           """Compute the updates for mean and variance."""
           if in_eager_mode and not self.trainable:
@@ -797,7 +788,22 @@ class BatchNormalizationV1(BatchNormalizationV2):
   _USE_V2_BEHAVIOR = False
 
 
-if tf2.enabled():
+BatchNormalization = None  # pylint: disable=invalid-name
+
+
+@tf_export(v1=['enable_v2_batch_normalization'])
+def enable_v2_batch_normalization():
+  global BatchNormalization  # pylint: disable=invalid-name
   BatchNormalization = BatchNormalizationV2
-else:
+
+
+@tf_export(v1=['disable_v2_batch_normalization'])
+def disable_v2_batch_normalization():
+  global BatchNormalization  # pylint: disable=invalid-name
   BatchNormalization = BatchNormalizationV1
+
+
+if tf2.enabled():
+  enable_v2_batch_normalization()
+else:
+  disable_v2_batch_normalization()
diff --git a/tensorflow/python/keras/layers/simplernn_test.py b/tensorflow/python/keras/layers/simplernn_test.py
index 390ae789e137d4f7ccc1b687d41227cef18878de..0e599dade94357551552bb4f3b4aa15e6ef8768d 100644
--- a/tensorflow/python/keras/layers/simplernn_test.py
+++ b/tensorflow/python/keras/layers/simplernn_test.py
@@ -22,7 +22,6 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
-from tensorflow.python.framework import test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
@@ -141,11 +140,6 @@ class SimpleRNNLayerTest(keras_parameterized.TestCase):
     else:
       self.assertEqual(len(layer.get_losses_for(x)), 1)
 
-
-class SimpleRNNLayerV1OnlyTest(test.TestCase):
-
-  @test_util.run_v1_only('b/120941292')
-  @test_util.run_in_graph_and_eager_modes
   def test_statefulness_SimpleRNN(self):
     num_samples = 2
     timesteps = 3
@@ -164,7 +158,7 @@ class SimpleRNNLayerV1OnlyTest(test.TestCase):
         units, return_sequences=False, stateful=True, weights=None)
     model.add(layer)
     model.compile(optimizer=gradient_descent.GradientDescentOptimizer(0.01),
-                  loss='mse')
+                  loss='mse', run_eagerly=testing_utils.should_run_eagerly())
     out1 = model.predict(np.ones((num_samples, timesteps)))
     self.assertEqual(out1.shape, (num_samples, units))
 
diff --git a/tensorflow/python/keras/layers/unified_gru_test.py b/tensorflow/python/keras/layers/unified_gru_test.py
index 244ffdb8b6cee2fbd3e0128647f639499d50c38a..11322764ac2bd4028ba50667a9150b34dff659b9 100644
--- a/tensorflow/python/keras/layers/unified_gru_test.py
+++ b/tensorflow/python/keras/layers/unified_gru_test.py
@@ -402,39 +402,6 @@ class UnifiedGRUTest(keras_parameterized.TestCase):
     else:
       self.assertEqual(len(layer.get_losses_for(x)), 1)
 
-
-class GRULayerGradientTapeTest(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_in_tape(self):
-    if not context.executing_eagerly():
-      self.skipTest('bloo')
-    time_steps = 10
-    embedding_size = 11
-    gru_unit_size = 12
-
-    gru = keras.layers.UnifiedGRU(gru_unit_size,
-                                  return_sequences=True,
-                                  return_state=True,
-                                  recurrent_activation='sigmoid',
-                                  recurrent_initializer='glorot_uniform')
-
-    x = random_ops.random_uniform([1, time_steps, embedding_size])
-    y = random_ops.random_uniform([1, gru_unit_size])
-
-    with backprop.GradientTape() as tape:
-      hidden_state = array_ops.zeros([1, gru_unit_size], dtype=dtypes.float32)
-      _, state = gru(x, initial_state=hidden_state)
-
-      loss = math_ops.reduce_mean(math_ops.square(state - y))
-
-    tape.gradient(loss, gru.variables)
-
-
-class GRULayerV1OnlyTest(test.TestCase, parameterized.TestCase):
-
-  @test_util.run_v1_only('b/120941292')
-  @test_util.run_in_graph_and_eager_modes(config=_config)
   def test_statefulness_GRU(self):
     num_samples = 2
     timesteps = 3
@@ -452,7 +419,8 @@ class GRULayerV1OnlyTest(test.TestCase, parameterized.TestCase):
     layer = layer_class(
         units, return_sequences=False, stateful=True, weights=None)
     model.add(layer)
-    model.compile(optimizer='sgd', loss='mse')
+    model.compile(optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                  loss='mse', run_eagerly=testing_utils.should_run_eagerly())
     out1 = model.predict(np.ones((num_samples, timesteps)))
     self.assertEqual(out1.shape, (num_samples, units))
 
@@ -497,6 +465,34 @@ class GRULayerV1OnlyTest(test.TestCase, parameterized.TestCase):
     np.testing.assert_allclose(out7, out6, atol=1e-5)
 
 
+class GRULayerGradientTapeTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_in_tape(self):
+    if not context.executing_eagerly():
+      self.skipTest('bloo')
+    time_steps = 10
+    embedding_size = 11
+    gru_unit_size = 12
+
+    gru = keras.layers.UnifiedGRU(gru_unit_size,
+                                  return_sequences=True,
+                                  return_state=True,
+                                  recurrent_activation='sigmoid',
+                                  recurrent_initializer='glorot_uniform')
+
+    x = random_ops.random_uniform([1, time_steps, embedding_size])
+    y = random_ops.random_uniform([1, gru_unit_size])
+
+    with backprop.GradientTape() as tape:
+      hidden_state = array_ops.zeros([1, gru_unit_size], dtype=dtypes.float32)
+      _, state = gru(x, initial_state=hidden_state)
+
+      loss = math_ops.reduce_mean(math_ops.square(state - y))
+
+    tape.gradient(loss, gru.variables)
+
+
 class GRULayerGraphOnlyTest(test.TestCase):
 
   # Need session for test
diff --git a/tensorflow/python/keras/layers/unified_lstm_test.py b/tensorflow/python/keras/layers/unified_lstm_test.py
index 7a66e2c2404919222cb7ee1c2183daf4ee869537..375894b166215ed7068767eed095fec2f60963ca 100644
--- a/tensorflow/python/keras/layers/unified_lstm_test.py
+++ b/tensorflow/python/keras/layers/unified_lstm_test.py
@@ -571,6 +571,68 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
     else:
       self.assertEqual(len(layer.get_losses_for(x)), 1)
 
+  def test_statefulness_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    layer_class = keras.layers.UnifiedLSTM
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.Embedding(
+            4,
+            embedding_dim,
+            mask_zero=True,
+            input_length=timesteps,
+            batch_input_shape=(num_samples, timesteps)))
+    layer = layer_class(
+        units, return_sequences=False, stateful=True, weights=None)
+    model.add(layer)
+    model.compile(optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                  loss='mse', run_eagerly=testing_utils.should_run_eagerly())
+    out1 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertEqual(out1.shape, (num_samples, units))
+
+    # train once so that the states change
+    model.train_on_batch(
+        np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
+    out2 = model.predict(np.ones((num_samples, timesteps)))
+
+    # if the state is not reset, output should be different
+    self.assertNotEqual(out1.max(), out2.max())
+
+    # check that output changes after states are reset
+    # (even though the model itself didn't change)
+    layer.reset_states()
+    out3 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out2.max(), out3.max())
+
+    # check that container-level reset_states() works
+    model.reset_states()
+    out4 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertAllClose(out3, out4, atol=1e-5)
+
+    # check that the call to `predict` updated the states
+    out5 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out4.max(), out5.max())
+
+    # Check masking
+    layer.reset_states()
+
+    left_padded_input = np.ones((num_samples, timesteps))
+    left_padded_input[0, :1] = 0
+    left_padded_input[1, :2] = 0
+    out6 = model.predict(left_padded_input)
+
+    layer.reset_states()
+
+    right_padded_input = np.ones((num_samples, timesteps))
+    right_padded_input[0, -1:] = 0
+    right_padded_input[1, -2:] = 0
+    out7 = model.predict(right_padded_input)
+
+    self.assertAllClose(out7, out6, atol=1e-5)
+
 
 class LSTMLayerGraphOnlyTest(test.TestCase):
 
@@ -682,7 +744,6 @@ class LSTMLayerGraphOnlyTest(test.TestCase):
 
 class LSTMLayerV1OnlyTest(test.TestCase, parameterized.TestCase):
 
-  @test_util.run_v1_only('b/121278392')
   @test_util.run_in_graph_and_eager_modes(config=_config)
   def test_dropout_LSTM(self):
     num_samples = 2
@@ -698,70 +759,6 @@ class LSTMLayerV1OnlyTest(test.TestCase, parameterized.TestCase):
         },
         input_shape=(num_samples, timesteps, embedding_dim))
 
-  @test_util.run_v1_only('b/120941292')
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_statefulness_LSTM(self):
-    num_samples = 2
-    timesteps = 3
-    embedding_dim = 4
-    units = 2
-    layer_class = keras.layers.UnifiedLSTM
-    model = keras.models.Sequential()
-    model.add(
-        keras.layers.Embedding(
-            4,
-            embedding_dim,
-            mask_zero=True,
-            input_length=timesteps,
-            batch_input_shape=(num_samples, timesteps)))
-    layer = layer_class(
-        units, return_sequences=False, stateful=True, weights=None)
-    model.add(layer)
-    model.compile(
-        optimizer=gradient_descent.GradientDescentOptimizer(0.01), loss='mse')
-    out1 = model.predict(np.ones((num_samples, timesteps)))
-    self.assertEqual(out1.shape, (num_samples, units))
-
-    # train once so that the states change
-    model.train_on_batch(
-        np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
-    out2 = model.predict(np.ones((num_samples, timesteps)))
-
-    # if the state is not reset, output should be different
-    self.assertNotEqual(out1.max(), out2.max())
-
-    # check that output changes after states are reset
-    # (even though the model itself didn't change)
-    layer.reset_states()
-    out3 = model.predict(np.ones((num_samples, timesteps)))
-    self.assertNotEqual(out2.max(), out3.max())
-
-    # check that container-level reset_states() works
-    model.reset_states()
-    out4 = model.predict(np.ones((num_samples, timesteps)))
-    self.assertAllClose(out3, out4, atol=1e-5)
-
-    # check that the call to `predict` updated the states
-    out5 = model.predict(np.ones((num_samples, timesteps)))
-    self.assertNotEqual(out4.max(), out5.max())
-
-    # Check masking
-    layer.reset_states()
-
-    left_padded_input = np.ones((num_samples, timesteps))
-    left_padded_input[0, :1] = 0
-    left_padded_input[1, :2] = 0
-    out6 = model.predict(left_padded_input)
-
-    layer.reset_states()
-
-    right_padded_input = np.ones((num_samples, timesteps))
-    right_padded_input[0, -1:] = 0
-    right_padded_input[1, -2:] = 0
-    out7 = model.predict(right_padded_input)
-
-    self.assertAllClose(out7, out6, atol=1e-5)
-
 
 class UnifiedLSTMPerformanceTest(test.Benchmark):
 
diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py
index 51ae935bb2cb497151485953759784734974ddb3..66780de0f0d73a2c052e77db5d98e4338958099b 100644
--- a/tensorflow/python/keras/losses.py
+++ b/tensorflow/python/keras/losses.py
@@ -395,6 +395,7 @@ class CategoricalCrossentropy(Loss):
           y_true, y_pred, from_logits=self.from_logits)
 
 
+@keras_export('keras.losses.Hinge')
 class Hinge(Loss):
   """Computes the hinge loss between `y_true` and `y_pred`.
 
@@ -429,6 +430,7 @@ class Hinge(Loss):
     return hinge(y_true, y_pred)
 
 
+@keras_export('keras.losses.SquaredHinge')
 class SquaredHinge(Loss):
   """Computes the squared hinge loss between `y_true` and `y_pred`.
 
@@ -463,6 +465,7 @@ class SquaredHinge(Loss):
     return squared_hinge(y_true, y_pred)
 
 
+@keras_export('keras.losses.CategoricalHinge')
 class CategoricalHinge(Loss):
   """Computes the categorical hinge loss between `y_true` and `y_pred`.
 
@@ -497,6 +500,161 @@ class CategoricalHinge(Loss):
     return categorical_hinge(y_true, y_pred)
 
 
+class LogLoss(Loss):
+  """Computes the log loss between `y_true` and `y_pred`.
+
+  logloss = - y_true * log(y_pred) - (1 - y_true) * log(1 - y_pred)
+
+  Usage:
+
+  ```python
+  l = tf.losses.LogLoss()
+  loss = l([0., 1., 1.], [1., 0., 1.])
+  print('Loss: ', loss.numpy())  # Loss: 10.745
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.losses.LogLoss())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return logloss(y_true, y_pred)
+
+
+class Poisson(Loss):
+  """Computes the poisson loss between `y_true` and `y_pred`.
+
+  loss = y_pred - y_true * log(y_pred)
+
+  Usage:
+
+  ```python
+  p = tf.losses.Poisson()
+  loss = p([1, 9, 2], [4, 8, 12])
+  print('Loss: ', loss.numpy())  # Loss: -4.63
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.losses.Poisson())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return poisson(y_true, y_pred)
+
+
+class Logcosh(Loss):
+  """Computes the logarithm of the hyperbolic cosine of the prediction error.
+
+  logcosh = log((exp(x) + exp(-x))/2) where x is the error `y_pred` - `y_true`.
+
+  Usage:
+
+  ```python
+  l = tf.losses.Logcosh()
+  loss = l([0., 1., 1.], [1., 0., 1.])
+  print('Loss: ', loss.numpy())  # Loss: 0.289
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.losses.Logcosh())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return logcosh(y_true, y_pred)
+
+
+class KullbackLeiblerDivergence(Loss):
+  """Computes kullback leibler divergence loss between `y_true` and `y_pred`.
+
+  loss = y_true * log(y_true / y_pred)
+
+  Usage:
+
+  ```python
+  k = tf.losses.KullbackLeiblerDivergence()
+  loss = k([.4, .9, .2], [.5, .8, .12])
+  print('Loss: ', loss.numpy())  # Loss: -0.043
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.losses.KullbackLeiblerDivergence())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return kullback_leibler_divergence(y_true, y_pred)
+
+
+class HuberLoss(Loss):
+  """Computes the huber loss between `y_true` and `y_pred`.
+
+  For each value x in `error=y_true-y_pred`, the following is calculated:
+
+    ```
+    0.5 * x^2                  if |x| <= d
+    0.5 * d^2 + d * (|x| - d)  if |x| > d
+    ```
+  where d is `delta`. See: https://en.wikipedia.org/wiki/Huber_loss
+
+  Usage:
+
+  ```python
+  l = tf.losses.HuberLoss()
+  loss = l([0., 1., 1.], [1., 0., 1.])
+  print('Loss: ', loss.numpy())  # Loss: 0.333
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.losses.HuberLoss())
+  ```
+
+  Args:
+    delta: A float, the point where the huber loss function changes from a
+      quadratic to linear.
+    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
+      `SUM_OVER_BATCH_SIZE`.
+    name: Optional name for the op.
+  """
+
+  def __init__(self,
+               delta=1.0,
+               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name=None):
+    super(HuberLoss, self).__init__(reduction=reduction, name=name)
+    self.delta = delta
+
+  def call(self, y_true, y_pred):
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return huber_loss(y_true, y_pred, delta=self.delta)
+
+
 @keras_export('keras.metrics.mean_squared_error',
               'keras.metrics.mse',
               'keras.metrics.MSE',
@@ -559,6 +717,46 @@ def categorical_hinge(y_true, y_pred):
   return math_ops.maximum(0., neg - pos + 1.)
 
 
+def logloss(y_true, y_pred):
+  losses = math_ops.multiply(y_true, math_ops.log(y_pred + K.epsilon()))
+  losses += math_ops.multiply((1 - y_true),
+                              math_ops.log(1 - y_pred + K.epsilon()))
+  return K.mean(-losses, axis=-1)
+
+
+def huber_loss(y_true, y_pred, delta=1.0):
+  """Computes huber loss value.
+
+  For each value x in `error=y_true-y_pred`, the following is calculated:
+
+    ```
+    0.5 * x^2                  if |x| <= d
+    0.5 * d^2 + d * (|x| - d)  if |x| > d
+    ```
+  where d is `delta`. See: https://en.wikipedia.org/wiki/Huber_loss
+
+  Args:
+    y_true: tensor of true targets.
+    y_pred: tensor of predicted targets.
+    delta: A float, the point where the huber loss function changes from a
+      quadratic to linear.
+
+  Returns:
+    Tensor with one scalar loss entry per sample.
+  """
+  y_pred = math_ops.cast(y_pred, dtype=K.floatx())
+  y_true = math_ops.cast(y_true, dtype=K.floatx())
+  error = math_ops.subtract(y_pred, y_true)
+  abs_error = math_ops.abs(error)
+  quadratic = math_ops.minimum(abs_error, delta)
+  linear = math_ops.subtract(abs_error, quadratic)
+  return math_ops.add(
+      math_ops.multiply(
+          ops.convert_to_tensor(0.5, dtype=quadratic.dtype),
+          math_ops.multiply(quadratic, quadratic)),
+      math_ops.multiply(delta, linear))
+
+
 @keras_export('keras.losses.logcosh')
 def logcosh(y_true, y_pred):
   """Logarithm of the hyperbolic cosine of the prediction error.
@@ -623,14 +821,15 @@ def poisson(y_true, y_pred):
               'keras.metrics.cosine',
               'keras.losses.cosine_proximity',
               'keras.losses.cosine')
-def cosine_proximity(y_true, y_pred):
-  y_true = nn.l2_normalize(y_true, axis=-1)
-  y_pred = nn.l2_normalize(y_pred, axis=-1)
-  return -math_ops.reduce_sum(y_true * y_pred, axis=-1)
+def cosine_proximity(y_true, y_pred, axis=-1):
+  y_true = nn.l2_normalize(y_true, axis=axis)
+  y_pred = nn.l2_normalize(y_pred, axis=axis)
+  return -math_ops.reduce_sum(y_true * y_pred, axis=axis)
 
 
+@keras_export('keras.losses.CosineProximity')
 class CosineProximity(Loss):
-  """Computes the cosine distance between `y_true` and `y_pred`.
+  """Computes the cosine proximity between `y_true` and `y_pred`.
 
   Usage:
 
@@ -646,8 +845,22 @@ class CosineProximity(Loss):
   model = keras.models.Model(inputs, outputs)
   model.compile('sgd', loss=tf.losses.CosineProximity())
   ```
+
+  Args:
+    axis: (Optional) Defaults to -1. The dimension along which the cosine
+      proximity is computed.
+    reduction: (Optional) Type of `tf.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
+    name: Optional name for the op.
   """
 
+  def __init__(self,
+               axis=-1,
+               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name=None):
+    super(CosineProximity, self).__init__(reduction=reduction, name=name)
+    self.axis = axis
+
   def call(self, y_true, y_pred):
     """Calculates the cosine proximity loss.
 
@@ -660,7 +873,7 @@ class CosineProximity(Loss):
     """
     y_pred = ops.convert_to_tensor(y_pred)
     y_true = math_ops.cast(y_true, y_pred.dtype)
-    return cosine_proximity(y_true, y_pred)
+    return cosine_proximity(y_true, y_pred, axis=self.axis)
 
 
 # Aliases.
diff --git a/tensorflow/python/keras/losses_test.py b/tensorflow/python/keras/losses_test.py
index 19ed7c8ed9d599585521fefa2ce1f725e1c4dd97..004c30f84d922e65ab7be07ebd64271c1e2af755 100644
--- a/tensorflow/python/keras/losses_test.py
+++ b/tensorflow/python/keras/losses_test.py
@@ -485,59 +485,88 @@ class MeanSquaredLogarithmicErrorTest(test.TestCase):
 @test_util.run_all_in_graph_and_eager_modes
 class CosineProximityTest(test.TestCase):
 
+  def l2_norm(self, x, axis):
+    epsilon = 1e-12
+    square_sum = np.sum(np.square(x), axis=axis, keepdims=True)
+    x_inv_norm = 1 / np.sqrt(np.maximum(square_sum, epsilon))
+    return np.multiply(x, x_inv_norm)
+
+  def setup(self, axis=1):
+    self.np_y_true = np.asarray([[1, 9, 2], [-5, -2, 6]], dtype=np.float32)
+    self.np_y_pred = np.asarray([[4, 8, 12], [8, 1, 3]], dtype=np.float32)
+
+    y_true = self.l2_norm(self.np_y_true, axis)
+    y_pred = self.l2_norm(self.np_y_pred, axis)
+    self.expected_loss = -np.sum(np.multiply(y_true, y_pred), axis=(axis,))
+
+    self.y_true = constant_op.constant(self.np_y_true)
+    self.y_pred = constant_op.constant(self.np_y_pred)
+
   def test_config(self):
     cosine_obj = keras.losses.CosineProximity(
-        reduction=losses_impl.ReductionV2.SUM, name='cosine_loss')
+        axis=2, reduction=losses_impl.ReductionV2.SUM, name='cosine_loss')
     self.assertEqual(cosine_obj.name, 'cosine_loss')
     self.assertEqual(cosine_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(cosine_obj.axis, 2)
 
   def test_unweighted(self):
+    self.setup()
     cosine_obj = keras.losses.CosineProximity()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
-    loss = cosine_obj(y_true, y_pred)
-    self.assertAlmostEqual(self.evaluate(loss), -0.18722, 3)
+    loss = cosine_obj(self.y_true, self.y_pred)
+    expected_loss = np.mean(self.expected_loss)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
 
   def test_scalar_weighted(self):
+    self.setup()
     cosine_obj = keras.losses.CosineProximity()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
-    loss = cosine_obj(y_true, y_pred, sample_weight=2.3)
-    self.assertAlmostEqual(self.evaluate(loss), -0.43060, 3)
+    sample_weight = 2.3
+    loss = cosine_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    expected_loss = np.mean(self.expected_loss * sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
 
   def test_sample_weighted(self):
+    self.setup()
     cosine_obj = keras.losses.CosineProximity()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
-    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
-    loss = cosine_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAlmostEqual(self.evaluate(loss), 0.15599, 3)
+    sample_weight = np.asarray([1.2, 3.4])
+    loss = cosine_obj(
+        self.y_true,
+        self.y_pred,
+        sample_weight=constant_op.constant(sample_weight))
+    expected_loss = np.mean(self.expected_loss * sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
 
   def test_timestep_weighted(self):
+    self.setup()
     cosine_obj = keras.losses.CosineProximity()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3, 1),
-                                  dtype=dtypes.float32)
-    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2], shape=(2, 3))
-    loss = cosine_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAlmostEqual(self.evaluate(loss), -2.0000, 3)
+    np_y_true = self.np_y_true.reshape((2, 3, 1))
+    np_y_pred = self.np_y_pred.reshape((2, 3, 1))
+    sample_weight = np.asarray([3, 6, 5, 0, 4, 2]).reshape((2, 3))
+
+    y_true = self.l2_norm(np_y_true, 2)
+    y_pred = self.l2_norm(np_y_pred, 2)
+    expected_loss = -np.sum(np.multiply(y_true, y_pred), axis=(2,))
+
+    y_true = constant_op.constant(np_y_true)
+    y_pred = constant_op.constant(np_y_pred)
+    loss = cosine_obj(
+        y_true, y_pred, sample_weight=constant_op.constant(sample_weight))
+
+    expected_loss = np.mean(expected_loss * sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
 
   def test_zero_weighted(self):
+    self.setup()
     cosine_obj = keras.losses.CosineProximity()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
-    loss = cosine_obj(y_true, y_pred, sample_weight=0)
+    loss = cosine_obj(self.y_true, self.y_pred, sample_weight=0)
     self.assertAlmostEqual(self.evaluate(loss), 0., 3)
 
+  def test_axis(self):
+    self.setup(axis=1)
+    cosine_obj = keras.losses.CosineProximity(axis=1)
+    loss = cosine_obj(self.y_true, self.y_pred)
+    expected_loss = np.mean(self.expected_loss)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
 
 @test_util.run_all_in_graph_and_eager_modes
 class BinaryCrossentropyTest(test.TestCase):
@@ -1003,5 +1032,439 @@ class CategoricalHingeTest(test.TestCase):
     self.assertAlmostEqual(self.evaluate(loss), 0., 3)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class LogLossTest(test.TestCase):
+
+  def setup(self):
+    # TODO(psv): Change to setUp() after b/122319309 is fixed.
+    y_pred = np.asarray([.9, .2, .2, .8, .4, .6]).reshape((2, 3))
+    y_true = np.asarray([1., 0., 1., 1., 0., 0.]).reshape((2, 3))
+    epsilon = 1e-7  # to avoid log 0
+
+    self.batch_size = 6
+    self.expected_losses = np.multiply(y_true, np.log(y_pred + epsilon))
+    self.expected_losses += np.multiply(1 - y_true,
+                                        np.log(1 - y_pred + epsilon))
+    self.expected_losses = -self.expected_losses
+
+    self.y_pred = constant_op.constant(y_pred)
+    self.y_true = constant_op.constant(y_true)
+
+  def test_config(self):
+    log_loss_obj = keras.losses.LogLoss(
+        reduction=losses_impl.ReductionV2.SUM, name='log')
+    self.assertEqual(log_loss_obj.name, 'log')
+    self.assertEqual(log_loss_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_all_correct(self):
+    self.setup()
+    log_loss_obj = keras.losses.LogLoss()
+    loss = log_loss_obj(self.y_true, self.y_true)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_unweighted(self):
+    self.setup()
+    log_loss_obj = keras.losses.LogLoss()
+    loss = log_loss_obj(self.y_true, self.y_pred)
+    actual_loss = np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+  def test_scalar_weighted(self):
+    self.setup()
+    log_loss_obj = keras.losses.LogLoss()
+    sample_weight = 2.3
+    loss = log_loss_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    actual_loss = sample_weight * np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = log_loss_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+
+  def test_sample_weighted(self):
+    self.setup()
+    log_loss_obj = keras.losses.LogLoss()
+    sample_weight = constant_op.constant((1.2, 3.4), shape=(2, 1))
+
+    loss = log_loss_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    actual_loss = np.multiply(
+        self.expected_losses,
+        np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)))
+    actual_loss = np.sum(actual_loss) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+  def test_timestep_weighted(self):
+    log_loss_obj = keras.losses.LogLoss()
+
+    y_pred = np.asarray([.9, .2, .2, .8, .4, .6]).reshape((2, 3, 1))
+    y_true = np.asarray([1., 0., 1., 1., 0., 0.]).reshape((2, 3, 1))
+    epsilon = 1e-7  # to avoid log 0
+    batch_size = 6
+
+    expected_losses = np.multiply(y_true, np.log(y_pred + epsilon))
+    expected_losses += np.multiply(1 - y_true, np.log(1 - y_pred + epsilon))
+
+    y_pred = constant_op.constant(y_pred)
+    y_true = constant_op.constant(y_true)
+    sample_weight = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3, 1))
+    loss = log_loss_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant(sample_weight, shape=(2, 3)))
+    actual_loss = np.multiply(-expected_losses, sample_weight)
+    actual_loss = np.sum(actual_loss) / batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+  def test_zero_weighted(self):
+    self.setup()
+    log_loss_obj = keras.losses.LogLoss()
+    sample_weight = 0
+    loss = log_loss_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class LogcoshTest(test.TestCase):
+
+  def setup(self):
+    y_pred = np.asarray([1, 9, 2, -5, -2, 6]).reshape((2, 3))
+    y_true = np.asarray([4, 8, 12, 8, 1, 3]).reshape((2, 3))
+
+    self.batch_size = 6
+    error = y_pred - y_true
+    self.expected_losses = np.log((np.exp(error) + np.exp(-error)) / 2)
+
+    self.y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    self.y_true = constant_op.constant(y_true)
+
+  def test_config(self):
+    logcosh_obj = keras.losses.Logcosh(
+        reduction=losses_impl.ReductionV2.SUM, name='logcosh_loss')
+    self.assertEqual(logcosh_obj.name, 'logcosh_loss')
+    self.assertEqual(logcosh_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    self.setup()
+    logcosh_obj = keras.losses.Logcosh()
+
+    loss = logcosh_obj(self.y_true, self.y_pred)
+    expected_loss = np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_scalar_weighted(self):
+    self.setup()
+    logcosh_obj = keras.losses.Logcosh()
+    sample_weight = 2.3
+
+    loss = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    expected_loss = sample_weight * np.sum(
+        self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+
+  def test_sample_weighted(self):
+    self.setup()
+    logcosh_obj = keras.losses.Logcosh()
+
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+
+    expected_loss = np.multiply(
+        self.expected_losses,
+        np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)))
+    expected_loss = np.sum(expected_loss) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_timestep_weighted(self):
+    self.setup()
+    logcosh_obj = keras.losses.Logcosh()
+    y_true = np.asarray([1, 9, 2, -5, -2, 6]).reshape(2, 3, 1)
+    y_pred = np.asarray([4, 8, 12, 8, 1, 3]).reshape(2, 3, 1)
+    error = y_pred - y_true
+    expected_losses = np.log((np.exp(error) + np.exp(-error)) / 2)
+    sample_weight = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3, 1))
+
+    y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    y_true = constant_op.constant(y_true)
+    loss = logcosh_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant(sample_weight, shape=(2, 3)))
+    expected_loss = np.sum(expected_losses * sample_weight) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_zero_weighted(self):
+    self.setup()
+    logcosh_obj = keras.losses.Logcosh()
+    sample_weight = 0
+    loss = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class PoissonTest(test.TestCase):
+
+  def setup(self):
+    self.np_y_pred = np.asarray([1, 9, 2, 5, 2, 6]).reshape((2, 3))
+    self.np_y_true = np.asarray([4, 8, 12, 8, 1, 3]).reshape((2, 3))
+
+    self.batch_size = 6
+    self.expected_losses = self.np_y_pred - np.multiply(self.np_y_true,
+                                                        np.log(self.np_y_pred))
+
+    self.y_pred = constant_op.constant(self.np_y_pred, dtype=dtypes.float32)
+    self.y_true = constant_op.constant(self.np_y_true)
+
+  def test_config(self):
+    poisson_obj = keras.losses.Poisson(
+        reduction=losses_impl.ReductionV2.SUM, name='poisson')
+    self.assertEqual(poisson_obj.name, 'poisson')
+    self.assertEqual(poisson_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    self.setup()
+    poisson_obj = keras.losses.Poisson()
+
+    loss = poisson_obj(self.y_true, self.y_pred)
+    expected_loss = np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_scalar_weighted(self):
+    self.setup()
+    poisson_obj = keras.losses.Poisson()
+    sample_weight = 2.3
+    loss = poisson_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+
+    expected_loss = sample_weight * np.sum(
+        self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = poisson_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+
+  def test_sample_weighted(self):
+    self.setup()
+    poisson_obj = keras.losses.Poisson()
+
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = poisson_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+
+    expected_loss = np.multiply(
+        self.expected_losses,
+        np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)))
+    expected_loss = np.sum(expected_loss) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_timestep_weighted(self):
+    self.setup()
+    poisson_obj = keras.losses.Poisson()
+    y_true = self.np_y_true.reshape(2, 3, 1)
+    y_pred = self.np_y_pred.reshape(2, 3, 1)
+    sample_weight = np.asarray([3, 6, 5, 0, 4, 2]).reshape(2, 3, 1)
+    expected_losses = y_pred - np.multiply(y_true, np.log(y_pred))
+
+    y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    y_true = constant_op.constant(y_true)
+
+    loss = poisson_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant(sample_weight, shape=(2, 3)))
+    expected_loss = np.sum(expected_losses * sample_weight) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_zero_weighted(self):
+    self.setup()
+    poisson_obj = keras.losses.Poisson()
+    loss = poisson_obj(self.y_true, self.y_pred, sample_weight=0)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class KullbackLeiblerDivergenceTest(test.TestCase):
+
+  def setup(self):
+    self.np_y_pred = np.asarray([.4, .9, .12, .36, .3, .4]).reshape((2, 3))
+    self.np_y_true = np.asarray([.5, .8, .12, .7, .43, .8]).reshape((2, 3))
+
+    self.batch_size = 2
+    self.expected_losses = np.multiply(self.np_y_true,
+                                       np.log(self.np_y_true / self.np_y_pred))
+
+    self.y_pred = constant_op.constant(self.np_y_pred, dtype=dtypes.float32)
+    self.y_true = constant_op.constant(self.np_y_true)
+
+  def test_config(self):
+    k_obj = keras.losses.KullbackLeiblerDivergence(
+        reduction=losses_impl.ReductionV2.SUM, name='kld')
+    self.assertEqual(k_obj.name, 'kld')
+    self.assertEqual(k_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    self.setup()
+    k_obj = keras.losses.KullbackLeiblerDivergence()
+
+    loss = k_obj(self.y_true, self.y_pred)
+    expected_loss = np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_scalar_weighted(self):
+    self.setup()
+    k_obj = keras.losses.KullbackLeiblerDivergence()
+    sample_weight = 2.3
+
+    loss = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    expected_loss = sample_weight * np.sum(
+        self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+
+  def test_sample_weighted(self):
+    self.setup()
+    k_obj = keras.losses.KullbackLeiblerDivergence()
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+
+    expected_loss = np.multiply(
+        self.expected_losses,
+        np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape(2, 3))
+    expected_loss = np.sum(expected_loss) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_timestep_weighted(self):
+    self.setup()
+    k_obj = keras.losses.KullbackLeiblerDivergence()
+    y_true = self.np_y_true.reshape(2, 3, 1)
+    y_pred = self.np_y_pred.reshape(2, 3, 1)
+    sample_weight = np.asarray([3, 6, 5, 0, 4, 2]).reshape(2, 3)
+    expected_losses = np.sum(
+        np.multiply(y_true, np.log(y_true / y_pred)), axis=-1)
+
+    y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    y_true = constant_op.constant(y_true)
+    loss = k_obj(
+        y_true, y_pred, sample_weight=constant_op.constant(sample_weight))
+
+    num_timesteps = 3
+    expected_loss = np.sum(expected_losses * sample_weight) / (
+        self.batch_size * num_timesteps)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_zero_weighted(self):
+    self.setup()
+    k_obj = keras.losses.KullbackLeiblerDivergence()
+    loss = k_obj(self.y_true, self.y_pred, sample_weight=0)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class HuberLossTest(test.TestCase):
+
+  def huber_loss(self, y_true, y_pred, delta=1.0):
+    error = y_pred - y_true
+    abs_error = np.abs(error)
+
+    quadratic = np.minimum(abs_error, delta)
+    linear = np.subtract(abs_error, quadratic)
+    return np.add(
+        np.multiply(0.5, np.multiply(quadratic, quadratic)),
+        np.multiply(delta, linear))
+
+  def setup(self, delta=1.0):
+    self.np_y_pred = np.asarray([.9, .2, .2, .8, .4, .6]).reshape((2, 3))
+    self.np_y_true = np.asarray([1., 0., 1., 1., 0., 0.]).reshape((2, 3))
+
+    self.batch_size = 6
+    self.expected_losses = self.huber_loss(self.np_y_true, self.np_y_pred,
+                                           delta)
+
+    self.y_pred = constant_op.constant(self.np_y_pred)
+    self.y_true = constant_op.constant(self.np_y_true)
+
+  def test_config(self):
+    h_obj = keras.losses.HuberLoss(
+        reduction=losses_impl.ReductionV2.SUM, name='huber')
+    self.assertEqual(h_obj.name, 'huber')
+    self.assertEqual(h_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_all_correct(self):
+    self.setup()
+    h_obj = keras.losses.HuberLoss()
+    loss = h_obj(self.y_true, self.y_true)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_unweighted(self):
+    self.setup()
+    h_obj = keras.losses.HuberLoss()
+    loss = h_obj(self.y_true, self.y_pred)
+    actual_loss = np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+  def test_scalar_weighted(self):
+    self.setup()
+    h_obj = keras.losses.HuberLoss()
+    sample_weight = 2.3
+    loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    actual_loss = sample_weight * np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+
+  def test_sample_weighted(self):
+    self.setup()
+    h_obj = keras.losses.HuberLoss()
+    sample_weight = constant_op.constant((1.2, 3.4), shape=(2, 1))
+
+    loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    actual_loss = np.multiply(
+        self.expected_losses,
+        np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)))
+    actual_loss = np.sum(actual_loss) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+  def test_timestep_weighted(self):
+    self.setup()
+    h_obj = keras.losses.HuberLoss()
+    y_pred = self.np_y_pred.reshape((2, 3, 1))
+    y_true = self.np_y_true.reshape((2, 3, 1))
+    expected_losses = self.huber_loss(y_true, y_pred)
+
+    y_pred = constant_op.constant(y_pred)
+    y_true = constant_op.constant(y_true)
+    sample_weight = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3, 1))
+    loss = h_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant(sample_weight, shape=(2, 3)))
+    actual_loss = np.multiply(expected_losses, sample_weight)
+    actual_loss = np.sum(actual_loss) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+  def test_zero_weighted(self):
+    self.setup()
+    h_obj = keras.losses.HuberLoss()
+    sample_weight = 0
+    loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+  def test_non_default_delta(self):
+    self.setup(delta=0.8)
+    h_obj = keras.losses.HuberLoss(delta=0.8)
+    sample_weight = 2.3
+    loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    actual_loss = sample_weight * np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py
index 707332350614d0c19265129ec2a27b42889a5cba..a13d8747d61af468036cdbab39883b75266856c9 100644
--- a/tensorflow/python/keras/metrics.py
+++ b/tensorflow/python/keras/metrics.py
@@ -51,6 +51,7 @@ from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import to_list
 from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import confusion_matrix
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
@@ -346,6 +347,85 @@ class Mean(Metric):
     return math_ops.div_no_nan(self.total, self.count)
 
 
+class MeanRelativeError(Mean):
+  """Computes the mean relative error by normalizing with the given values.
+
+  This metric creates two local variables, `total` and `count` that are used to
+  compute the mean relative absolute error. This average is weighted by
+  `sample_weight`, and it is ultimately returned as `mean_relative_error`:
+  an idempotent operation that simply divides `total` by `count`.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.MeanRelativeError(normalizer=[1, 3, 2, 3])
+  m.update_state([1, 3, 2, 3], [2, 4, 6, 8])
+
+  # metric = mean(|y_pred - y_true| / normalizer)
+  #        = mean([1, 1, 4, 5] / [1, 3, 2, 3]) = mean([1, 1/3, 2, 5/3])
+  #        = 5/4 = 1.25
+  print('Final result: ', m.result().numpy())  # Final result: 1.25
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile(
+    'sgd',
+    loss='mse',
+    metrics=[tf.keras.metrics.MeanRelativeError(normalizer=[1, 3])])
+  ```
+  """
+
+  def __init__(self, normalizer, name=None, dtype=None):
+    """Creates a `MeanRelativeError` instance.
+
+    Args:
+      normalizer: The normalizer values with same shape as predictions.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(MeanRelativeError, self).__init__(name=name, dtype=dtype)
+    normalizer = math_ops.cast(normalizer, self._dtype)
+    self.normalizer = normalizer
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates metric statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
+
+    Returns:
+      Update op.
+    """
+    y_true = math_ops.cast(y_true, self._dtype)
+    y_pred = math_ops.cast(y_pred, self._dtype)
+    y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
+        y_pred, y_true, sample_weight)
+
+    y_pred, self.normalizer = confusion_matrix.remove_squeezable_dimensions(
+        y_pred, self.normalizer)
+    y_pred.shape.assert_is_compatible_with(y_pred.shape)
+    relative_errors = math_ops.div_no_nan(
+        math_ops.abs(y_true - y_pred), self.normalizer)
+
+    return super(MeanRelativeError, self).update_state(
+        relative_errors, sample_weight=sample_weight)
+
+  def get_config(self):
+    config = {'normalizer': self.normalizer}
+    base_config = super(MeanRelativeError, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
 class MeanMetricWrapper(Mean):
   """Wraps a stateless metric function with the Mean metric."""
 
@@ -590,6 +670,84 @@ class SparseCategoricalAccuracy(MeanMetricWrapper):
     return super(SparseCategoricalAccuracy, cls).from_config(config)
 
 
+class TopKCategoricalAccuracy(MeanMetricWrapper):
+  """Computes how often targets are in the top `K` predictions.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.TopKCategoricalAccuracy()
+  m.update_state([[0, 0, 1], [0, 1, 0]], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+  print('Final result: ', m.result().numpy())  # Final result: 1.0
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.TopKCategoricalAccuracy()])
+  ```
+  """
+
+  def __init__(self, k=5, name='top_k_categorical_accuracy', dtype=None):
+    """Creates a `TopKCategoricalAccuracy` instance.
+
+    Args:
+      k: (Optional) Number of top elements to look at for computing accuracy.
+        Defaults to 5.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(TopKCategoricalAccuracy, self).__init__(
+        top_k_categorical_accuracy, name, dtype=dtype, k=k)
+
+  @classmethod
+  def from_config(cls, config):
+    if 'fn' in config:
+      config.pop('fn')
+    return super(TopKCategoricalAccuracy, cls).from_config(config)
+
+
+class SparseTopKCategoricalAccuracy(MeanMetricWrapper):
+  """Computes how often integer targets are in the top `K` predictions.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.SparseTopKCategoricalAccuracy()
+  m.update_state([2, 1], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+  print('Final result: ', m.result().numpy())  # Final result: 1.0
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile(
+    'sgd',
+    metrics=[tf.keras.metrics.SparseTopKCategoricalAccuracy()])
+  ```
+  """
+
+  def __init__(self, k=5, name='sparse_top_k_categorical_accuracy', dtype=None):
+    """Creates a `SparseTopKCategoricalAccuracy` instance.
+
+    Args:
+      k: (Optional) Number of top elements to look at for computing accuracy.
+        Defaults to 5.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(SparseTopKCategoricalAccuracy, self).__init__(
+        sparse_top_k_categorical_accuracy, name, dtype=dtype, k=k)
+
+  @classmethod
+  def from_config(cls, config):
+    if 'fn' in config:
+      config.pop('fn')
+    return super(SparseTopKCategoricalAccuracy, cls).from_config(config)
+
+
 class _ConfusionMatrixConditionCount(Metric):
   """Calculates the number of the given confusion matrix condition."""
 
@@ -612,6 +770,7 @@ class _ConfusionMatrixConditionCount(Metric):
     """
     super(_ConfusionMatrixConditionCount, self).__init__(name=name, dtype=dtype)
     self._confusion_matrix_cond = confusion_matrix_cond
+    self.init_thresholds = thresholds
     self.thresholds = metrics_utils.parse_init_thresholds(
         thresholds, default_threshold=0.5)
     self.accumulator = self.add_weight(
@@ -648,6 +807,11 @@ class _ConfusionMatrixConditionCount(Metric):
     for v in self.variables:
       K.set_value(v, np.zeros((num_thresholds,)))
 
+  def get_config(self):
+    config = {'thresholds': self.init_thresholds}
+    base_config = super(_ConfusionMatrixConditionCount, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
 @keras_export('keras.metrics.FalsePositives')
 class FalsePositives(_ConfusionMatrixConditionCount):
@@ -894,6 +1058,7 @@ class Precision(Metric):
       dtype: (Optional) data type of the metric result.
     """
     super(Precision, self).__init__(name=name, dtype=dtype)
+    self.init_thresholds = thresholds
     self.thresholds = metrics_utils.parse_init_thresholds(
         thresholds, default_threshold=0.5)
     self.tp = self.add_weight(
@@ -932,6 +1097,11 @@ class Precision(Metric):
     for v in self.variables:
       K.set_value(v, np.zeros((num_thresholds,)))
 
+  def get_config(self):
+    config = {'thresholds': self.init_thresholds}
+    base_config = super(Precision, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
 @keras_export('keras.metrics.Recall')
 class Recall(Metric):
@@ -978,6 +1148,7 @@ class Recall(Metric):
       dtype: (Optional) data type of the metric result.
     """
     super(Recall, self).__init__(name=name, dtype=dtype)
+    self.init_thresholds = thresholds
     self.thresholds = metrics_utils.parse_init_thresholds(
         thresholds, default_threshold=0.5)
     self.tp = self.add_weight(
@@ -1016,6 +1187,11 @@ class Recall(Metric):
     for v in self.variables:
       K.set_value(v, np.zeros((num_thresholds,)))
 
+  def get_config(self):
+    config = {'thresholds': self.init_thresholds}
+    base_config = super(Recall, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
 @six.add_metaclass(abc.ABCMeta)
 class SensitivitySpecificityBase(Metric):
@@ -1132,6 +1308,8 @@ class SensitivityAtSpecificity(SensitivitySpecificityBase):
     """
     if specificity < 0 or specificity > 1:
       raise ValueError('`specificity` must be in the range [0, 1].')
+    self.specificity = specificity
+    self.num_thresholds = num_thresholds
     super(SensitivityAtSpecificity, self).__init__(
         specificity, num_thresholds=num_thresholds, name=name, dtype=dtype)
 
@@ -1149,6 +1327,14 @@ class SensitivityAtSpecificity(SensitivitySpecificityBase):
     return math_ops.div_no_nan(self.tp[min_index],
                                self.tp[min_index] + self.fn[min_index])
 
+  def get_config(self):
+    config = {
+        'num_thresholds': self.num_thresholds,
+        'specificity': self.specificity
+    }
+    base_config = super(SensitivityAtSpecificity, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
 @keras_export('keras.metrics.SpecificityAtSensitivity')
 class SpecificityAtSensitivity(SensitivitySpecificityBase):
@@ -1201,6 +1387,8 @@ class SpecificityAtSensitivity(SensitivitySpecificityBase):
     """
     if sensitivity < 0 or sensitivity > 1:
       raise ValueError('`sensitivity` must be in the range [0, 1].')
+    self.sensitivity = sensitivity
+    self.num_thresholds = num_thresholds
     super(SpecificityAtSensitivity, self).__init__(
         sensitivity, num_thresholds=num_thresholds, name=name, dtype=dtype)
 
@@ -1218,7 +1406,16 @@ class SpecificityAtSensitivity(SensitivitySpecificityBase):
     return math_ops.div_no_nan(self.tn[min_index],
                                self.tn[min_index] + self.fp[min_index])
 
+  def get_config(self):
+    config = {
+        'num_thresholds': self.num_thresholds,
+        'sensitivity': self.sensitivity
+    }
+    base_config = super(SpecificityAtSensitivity, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
+
+@keras_export('keras.metrics.CosineProximity')
 class CosineProximity(MeanMetricWrapper):
   """Computes the cosine distance between the labels and predictions.
 
@@ -1246,8 +1443,16 @@ class CosineProximity(MeanMetricWrapper):
   ```
   """
 
-  def __init__(self, name='cosine_proximity', dtype=None):
-    super(CosineProximity, self).__init__(cosine, name, dtype=dtype)
+  def __init__(self, name='cosine_proximity', dtype=None, axis=-1):
+    """Creates a `CosineProximity` instance.
+
+    Args:
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+      axis: (Optional) Defaults to -1. The dimension along which the cosine
+        proximity is computed.
+    """
+    super(CosineProximity, self).__init__(cosine, name, dtype=dtype, axis=axis)
 
   @classmethod
   def from_config(cls, config):
@@ -1256,6 +1461,7 @@ class CosineProximity(MeanMetricWrapper):
     return super(CosineProximity, cls).from_config(config)
 
 
+@keras_export('keras.metrics.MeanAbsoluteError')
 class MeanAbsoluteError(MeanMetricWrapper):
   """Computes the mean absolute error between the labels and predictions.
 
@@ -1264,8 +1470,8 @@ class MeanAbsoluteError(MeanMetricWrapper):
 
   Usage:
   ```python
-  mae = tf.metrics.MeanAbsoluteError()
-  mae.update_state([0., 0., 1., 1.], [1., 1., 1., 0.])
+  m = tf.metrics.MeanAbsoluteError()
+  m.update_state([0., 0., 1., 1.], [1., 1., 1., 0.])
   print('Final result: ', m.result().numpy())  # Final result: 0.75
   ```
 
@@ -1273,7 +1479,7 @@ class MeanAbsoluteError(MeanMetricWrapper):
 
   ```python
   model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.keras.losses.MeanAbsoluteError())
+  model.compile('sgd', metrics=[tf.keras.metrics.MeanAbsoluteError()])
   ```
   """
 
@@ -1288,6 +1494,7 @@ class MeanAbsoluteError(MeanMetricWrapper):
     return super(MeanAbsoluteError, cls).from_config(config)
 
 
+@keras_export('keras.metrics.MeanAbsolutePercentageError')
 class MeanAbsolutePercentageError(MeanMetricWrapper):
   """Computes the mean absolute percentage error between `y_true` and `y_pred`.
 
@@ -1297,8 +1504,8 @@ class MeanAbsolutePercentageError(MeanMetricWrapper):
   Usage:
 
   ```python
-  mape = tf.keras.losses.MeanAbsolutePercentageError()
-  mape.update_state([0., 0., 1., 1.], [1., 1., 1., 0.])
+  m = tf.keras.metrics.MeanAbsolutePercentageError()
+  m.update_state([0., 0., 1., 1.], [1., 1., 1., 0.])
   print('Final result: ', m.result().numpy())  # Final result: 5e+08
   ```
 
@@ -1306,7 +1513,7 @@ class MeanAbsolutePercentageError(MeanMetricWrapper):
 
   ```python
   model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.keras.losses.MeanAbsolutePercentageError())
+  model.compile('sgd', metrics=[tf.keras.metrics.MeanAbsolutePercentageError()])
   ```
   """
 
@@ -1321,6 +1528,7 @@ class MeanAbsolutePercentageError(MeanMetricWrapper):
     return super(MeanAbsolutePercentageError, cls).from_config(config)
 
 
+@keras_export('keras.metrics.MeanSquaredError')
 class MeanSquaredError(MeanMetricWrapper):
   """Computes the mean squared error between `y_true` and `y_pred`.
 
@@ -1330,8 +1538,8 @@ class MeanSquaredError(MeanMetricWrapper):
   Usage:
 
   ```python
-  mape = tf.keras.losses.MeanSquaredError()
-  mape.update_state([0., 0., 1., 1.], [1., 1., 1., 0.])
+  m = tf.keras.metrics.MeanSquaredError()
+  m.update_state([0., 0., 1., 1.], [1., 1., 1., 0.])
   print('Final result: ', m.result().numpy())  # Final result: 0.75
   ```
 
@@ -1339,7 +1547,7 @@ class MeanSquaredError(MeanMetricWrapper):
 
   ```python
   model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.keras.losses.MeanSquaredError())
+  model.compile('sgd', metrics=[tf.keras.metrics.MeanSquaredError()])
   ```
   """
 
@@ -1354,6 +1562,7 @@ class MeanSquaredError(MeanMetricWrapper):
     return super(MeanSquaredError, cls).from_config(config)
 
 
+@keras_export('keras.metrics.MeanSquaredLogarithmicError')
 class MeanSquaredLogarithmicError(MeanMetricWrapper):
   """Computes the mean squared logarithmic error between `y_true` and `y_pred`.
 
@@ -1363,8 +1572,8 @@ class MeanSquaredLogarithmicError(MeanMetricWrapper):
   Usage:
 
   ```python
-  msle = tf.keras.losses.MeanSquaredLogarithmicError()
-  msle.update_state([0., 0., 1., 1.], [1., 1., 1., 0.])
+  m = tf.keras.metrics.MeanSquaredLogarithmicError()
+  m.update_state([0., 0., 1., 1.], [1., 1., 1., 0.])
   print('Final result: ', m.result().numpy())  # Final result: 0.36034
   ```
 
@@ -1372,7 +1581,7 @@ class MeanSquaredLogarithmicError(MeanMetricWrapper):
 
   ```python
   model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.keras.losses.MeanSquaredLogarithmicError())
+  model.compile('sgd', metrics=[tf.keras.metrics.MeanSquaredLogarithmicError()])
   ```
   """
 
@@ -1387,6 +1596,7 @@ class MeanSquaredLogarithmicError(MeanMetricWrapper):
     return super(MeanSquaredLogarithmicError, cls).from_config(config)
 
 
+@keras_export('keras.metrics.Hinge')
 class Hinge(MeanMetricWrapper):
   """Computes the hinge metric between `y_true` and `y_pred`.
 
@@ -1396,8 +1606,8 @@ class Hinge(MeanMetricWrapper):
   Usage:
 
   ```python
-  h = tf.keras.metrics.Hinge()
-  h.update_state([0., 1., 1.], [1., 0., 1.])
+  m = tf.keras.metrics.Hinge()
+  m.update_state([0., 1., 1.], [1., 0., 1.])
   print('Final result: ', m.result().numpy())  # Final result: 0.66
   ```
 
@@ -1405,7 +1615,7 @@ class Hinge(MeanMetricWrapper):
 
   ```python
   model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.keras.metrics.Hinge())
+  model.compile('sgd', metrics=[tf.keras.metrics.Hinge()])
   ```
   """
 
@@ -1419,6 +1629,7 @@ class Hinge(MeanMetricWrapper):
     return super(Hinge, cls).from_config(config)
 
 
+@keras_export('keras.metrics.SquaredHinge')
 class SquaredHinge(MeanMetricWrapper):
   """Computes the squared hinge metric between `y_true` and `y_pred`.
 
@@ -1428,8 +1639,8 @@ class SquaredHinge(MeanMetricWrapper):
   Usage:
 
   ```python
-  h = tf.keras.metrics.SquaredHinge()
-  h.update_state([0., 1., 1.], [1., 0., 1.])
+  m = tf.keras.metrics.SquaredHinge()
+  m.update_state([0., 1., 1.], [1., 0., 1.])
   print('Final result: ', m.result().numpy())  # Final result: 0.66
   ```
 
@@ -1437,7 +1648,7 @@ class SquaredHinge(MeanMetricWrapper):
 
   ```python
   model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.keras.metrics.SquaredHinge())
+  model.compile('sgd', metrics=[tf.keras.metrics.SquaredHinge()])
   ```
   """
 
@@ -1451,6 +1662,7 @@ class SquaredHinge(MeanMetricWrapper):
     return super(SquaredHinge, cls).from_config(config)
 
 
+@keras_export('keras.metrics.CategoricalHinge')
 class CategoricalHinge(MeanMetricWrapper):
   """Computes the categorical hinge metric between `y_true` and `y_pred`.
 
@@ -1460,8 +1672,8 @@ class CategoricalHinge(MeanMetricWrapper):
   Usage:
 
   ```python
-  h = tf.keras.metrics.CategoricalHinge()
-  h.update_state([0., 1., 1.], [1., 0., 1.])
+  m = tf.keras.metrics.CategoricalHinge()
+  m.update_state([0., 1., 1.], [1., 0., 1.])
   print('Final result: ', m.result().numpy())  # Final result: 1.0
   ```
 
@@ -1469,7 +1681,7 @@ class CategoricalHinge(MeanMetricWrapper):
 
   ```python
   model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.keras.metrics.CategoricalHinge())
+  model.compile('sgd', metrics=[tf.keras.metrics.CategoricalHinge()])
   ```
   """
 
@@ -1483,6 +1695,147 @@ class CategoricalHinge(MeanMetricWrapper):
     return super(CategoricalHinge, cls).from_config(config)
 
 
+class RootMeanSquaredError(Mean):
+  """Computes root mean squared error metric between `y_true` and `y_pred`.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.RootMeanSquaredError()
+  m.update_state([2., 4., 6.], [1., 3., 2.])
+  print('Final result: ', m.result().numpy())  # Final result: 2.449
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.RootMeanSquaredError()])
+  ```
+  """
+
+  def __init__(self, name='root_mean_squared_error', dtype=None):
+    super(RootMeanSquaredError, self).__init__(name, dtype=dtype)
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates root mean squared error statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
+
+    Returns:
+      Update op.
+    """
+    y_true = math_ops.cast(y_true, self._dtype)
+    y_pred = math_ops.cast(y_pred, self._dtype)
+    y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
+        y_pred, y_true, sample_weight)
+    error_sq = math_ops.square(y_pred - y_true)
+    return super(RootMeanSquaredError, self).update_state(
+        error_sq, sample_weight=sample_weight)
+
+  def result(self):
+    return math_ops.sqrt(math_ops.div_no_nan(self.total, self.count))
+
+
+class Logcosh(MeanMetricWrapper):
+  """Computes the logarithm of the hyperbolic cosine of the prediction error.
+
+  logcosh = log((exp(x) + exp(-x))/2) where x is the error `y_pred` - `y_true`.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.Logcosh()
+  m.update_state([0., 1., 1.], [1., 0., 1.])
+  print('Final result: ', m.result().numpy())  # Final result: 0.289
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.Logcosh()])
+  ```
+  """
+
+  def __init__(self, name='logcosh', dtype=None):
+    super(Logcosh, self).__init__(logcosh, name, dtype=dtype)
+
+  @classmethod
+  def from_config(cls, config):
+    if 'fn' in config:
+      config.pop('fn')
+    return super(Logcosh, cls).from_config(config)
+
+
+class Poisson(MeanMetricWrapper):
+  """Computes the poisson metric between `y_true` and `y_pred`.
+
+  metric = y_pred - y_true * log(y_pred)
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.Poisson()
+  m.update_state([1, 9, 2], [4, 8, 12])
+  print('Final result: ', m.result().numpy())  # Final result: -4.63
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.Poisson()])
+  ```
+  """
+
+  def __init__(self, name='poisson', dtype=None):
+    super(Poisson, self).__init__(poisson, name, dtype=dtype)
+
+  @classmethod
+  def from_config(cls, config):
+    if 'fn' in config:
+      config.pop('fn')
+    return super(Poisson, cls).from_config(config)
+
+
+class KullbackLeiblerDivergence(MeanMetricWrapper):
+  """Computes kullback leibler divergence metric between `y_true` and `y_pred`.
+
+  metric = y_true * log(y_true / y_pred)
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.KullbackLeiblerDivergence()
+  m.update_state([.4, .9, .2], [.5, .8, .12])
+  print('Final result: ', m.result().numpy())  # Final result: -0.043
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.KullbackLeiblerDivergence()])
+  ```
+  """
+
+  def __init__(self, name='kullback_leibler_divergence', dtype=None):
+    super(KullbackLeiblerDivergence, self).__init__(
+        kullback_leibler_divergence, name, dtype=dtype)
+
+  @classmethod
+  def from_config(cls, config):
+    if 'fn' in config:
+      config.pop('fn')
+    return super(KullbackLeiblerDivergence, cls).from_config(config)
+
+
 def accuracy(y_true, y_pred):
   y_pred.get_shape().assert_is_compatible_with(y_true.get_shape())
   if y_true.dtype != y_pred.dtype:
@@ -1507,8 +1860,11 @@ def categorical_accuracy(y_true, y_pred):
 
 @keras_export('keras.metrics.sparse_categorical_accuracy')
 def sparse_categorical_accuracy(y_true, y_pred):
+  y_pred_rank = ops.convert_to_tensor(y_pred).get_shape().ndims
+  y_true_rank = ops.convert_to_tensor(y_true).get_shape().ndims
   # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,)
-  if (len(K.int_shape(y_true)) == len(K.int_shape(y_pred))):
+  if (y_true_rank is not None) and (y_pred_rank is not None) and (len(
+      K.int_shape(y_true)) == len(K.int_shape(y_pred))):
     y_true = array_ops.squeeze(y_true, [-1])
   y_pred = math_ops.argmax(y_pred, axis=-1)
 
@@ -1528,8 +1884,11 @@ def top_k_categorical_accuracy(y_true, y_pred, k=5):
 
 @keras_export('keras.metrics.sparse_top_k_categorical_accuracy')
 def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5):
+  y_pred_rank = ops.convert_to_tensor(y_pred).get_shape().ndims
+  y_true_rank = ops.convert_to_tensor(y_true).get_shape().ndims
   # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,)
-  if (len(K.int_shape(y_true)) == len(K.int_shape(y_pred))):
+  if (y_true_rank is not None) and (y_pred_rank is not None) and (len(
+      K.int_shape(y_true)) == len(K.int_shape(y_pred))):
     y_true = array_ops.squeeze(y_true, [-1])
 
   return K.mean(nn.in_top_k(y_pred, math_ops.cast(y_true, 'int32'), k), axis=-1)
diff --git a/tensorflow/python/keras/metrics_test.py b/tensorflow/python/keras/metrics_test.py
index a6d714dcfb4ac1dd57e2a47254f107137a5279ae..42da1dfb99344cfa87fd443c10eba2f57a236ea4 100644
--- a/tensorflow/python/keras/metrics_test.py
+++ b/tensorflow/python/keras/metrics_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import math
 import os
 from absl.testing import parameterized
 import numpy as np
@@ -76,6 +77,13 @@ class KerasMeanTest(test.TestCase):
     self.assertEqual(self.evaluate(m.total), 0)
     self.assertEqual(self.evaluate(m.count), 0)
 
+    # Check save and restore config
+    m2 = metrics.Mean.from_config(m.get_config())
+    self.assertEqual(m2.name, 'my_mean')
+    self.assertTrue(m2.stateful)
+    self.assertEqual(m2.dtype, dtypes.float32)
+    self.assertEqual(len(m2.variables), 2)
+
   def test_mean_with_sample_weight(self):
     m = metrics.Mean(dtype=dtypes.float64)
     self.assertEqual(m.dtype, dtypes.float64)
@@ -189,6 +197,13 @@ class KerasAccuracyTest(test.TestCase):
     result = self.evaluate(acc_obj.result())
     self.assertEqual(result, 1)  # 2/2
 
+    # Check save and restore config
+    a2 = metrics.Accuracy.from_config(acc_obj.get_config())
+    self.assertEqual(a2.name, 'my acc')
+    self.assertTrue(a2.stateful)
+    self.assertEqual(len(a2.variables), 2)
+    self.assertEqual(a2.dtype, dtypes.float32)
+
     # check with sample_weight
     result_t = acc_obj([[2], [1]], [[2], [0]], sample_weight=[[0.5], [0.2]])
     result = self.evaluate(result_t)
@@ -279,6 +294,47 @@ class KerasAccuracyTest(test.TestCase):
     result = self.evaluate(result_t)
     self.assertAlmostEqual(result, 0.93, 2)  # 2.5/2.7
 
+  def test_sparse_categorical_accuracy_mismatched_dims(self):
+    acc_obj = metrics.SparseCategoricalAccuracy(name='my acc')
+
+    # check config
+    self.assertEqual(acc_obj.name, 'my acc')
+    self.assertTrue(acc_obj.stateful)
+    self.assertEqual(len(acc_obj.variables), 2)
+    self.assertEqual(acc_obj.dtype, dtypes.float32)
+    self.evaluate(variables.variables_initializer(acc_obj.variables))
+
+    # verify that correct value is returned
+    update_op = acc_obj.update_state([2, 1], [[0.1, 0.1, 0.8], [0.05, 0.95, 0]])
+    self.evaluate(update_op)
+    result = self.evaluate(acc_obj.result())
+    self.assertEqual(result, 1)  # 2/2
+
+    # check with sample_weight
+    result_t = acc_obj([2, 1], [[0.1, 0.1, 0.8], [0.05, 0, 0.95]],
+                       [[0.5], [0.2]])
+    result = self.evaluate(result_t)
+    self.assertAlmostEqual(result, 0.93, 2)  # 2.5/2.7
+
+  def test_sparse_categorical_accuracy_mismatched_dims_dynamic(self):
+    with context.graph_mode(), self.cached_session() as sess:
+      acc_obj = metrics.SparseCategoricalAccuracy(name='my acc')
+      self.evaluate(variables.variables_initializer(acc_obj.variables))
+
+      t = array_ops.placeholder(dtypes.float32)
+      p = array_ops.placeholder(dtypes.float32)
+      w = array_ops.placeholder(dtypes.float32)
+
+      result_t = acc_obj(t, p, w)
+      result = sess.run(
+          result_t,
+          feed_dict=({
+              t: [2, 1],
+              p: [[0.1, 0.1, 0.8], [0.05, 0, 0.95]],
+              w: [[0.5], [0.2]]
+          }))
+      self.assertAlmostEqual(result, 0.71, 2)  # 2.5/2.7
+
 
 @test_util.run_all_in_graph_and_eager_modes
 class FalsePositivesTest(test.TestCase):
@@ -289,6 +345,12 @@ class FalsePositivesTest(test.TestCase):
     self.assertEqual(len(fp_obj.variables), 1)
     self.assertEqual(fp_obj.thresholds, [0.4, 0.9])
 
+    # Check save and restore config
+    fp_obj2 = metrics.FalsePositives.from_config(fp_obj.get_config())
+    self.assertEqual(fp_obj2.name, 'my_fp')
+    self.assertEqual(len(fp_obj2.variables), 1)
+    self.assertEqual(fp_obj2.thresholds, [0.4, 0.9])
+
   def test_unweighted(self):
     fp_obj = metrics.FalsePositives()
     self.evaluate(variables.variables_initializer(fp_obj.variables))
@@ -363,6 +425,12 @@ class FalseNegativesTest(test.TestCase):
     self.assertEqual(len(fn_obj.variables), 1)
     self.assertEqual(fn_obj.thresholds, [0.4, 0.9])
 
+    # Check save and restore config
+    fn_obj2 = metrics.FalseNegatives.from_config(fn_obj.get_config())
+    self.assertEqual(fn_obj2.name, 'my_fn')
+    self.assertEqual(len(fn_obj2.variables), 1)
+    self.assertEqual(fn_obj2.thresholds, [0.4, 0.9])
+
   def test_unweighted(self):
     fn_obj = metrics.FalseNegatives()
     self.evaluate(variables.variables_initializer(fn_obj.variables))
@@ -425,6 +493,12 @@ class TrueNegativesTest(test.TestCase):
     self.assertEqual(len(tn_obj.variables), 1)
     self.assertEqual(tn_obj.thresholds, [0.4, 0.9])
 
+    # Check save and restore config
+    tn_obj2 = metrics.TrueNegatives.from_config(tn_obj.get_config())
+    self.assertEqual(tn_obj2.name, 'my_tn')
+    self.assertEqual(len(tn_obj2.variables), 1)
+    self.assertEqual(tn_obj2.thresholds, [0.4, 0.9])
+
   def test_unweighted(self):
     tn_obj = metrics.TrueNegatives()
     self.evaluate(variables.variables_initializer(tn_obj.variables))
@@ -487,6 +561,12 @@ class TruePositivesTest(test.TestCase):
     self.assertEqual(len(tp_obj.variables), 1)
     self.assertEqual(tp_obj.thresholds, [0.4, 0.9])
 
+    # Check save and restore config
+    tp_obj2 = metrics.TruePositives.from_config(tp_obj.get_config())
+    self.assertEqual(tp_obj2.name, 'my_tp')
+    self.assertEqual(len(tp_obj2.variables), 1)
+    self.assertEqual(tp_obj2.thresholds, [0.4, 0.9])
+
   def test_unweighted(self):
     tp_obj = metrics.TruePositives()
     self.evaluate(variables.variables_initializer(tp_obj.variables))
@@ -550,6 +630,12 @@ class PrecisionTest(test.TestCase):
                      ['true_positives:0', 'false_positives:0'])
     self.assertEqual(p_obj.thresholds, [0.4, 0.9])
 
+    # Check save and restore config
+    p_obj2 = metrics.Precision.from_config(p_obj.get_config())
+    self.assertEqual(p_obj2.name, 'my_precision')
+    self.assertEqual(len(p_obj2.variables), 2)
+    self.assertEqual(p_obj2.thresholds, [0.4, 0.9])
+
   def test_value_is_idempotent(self):
     p_obj = metrics.Precision(thresholds=[0.3, 0.72])
     y_pred = random_ops.random_uniform(shape=(10, 3))
@@ -662,6 +748,12 @@ class RecallTest(test.TestCase):
                      ['true_positives:0', 'false_negatives:0'])
     self.assertEqual(r_obj.thresholds, [0.4, 0.9])
 
+    # Check save and restore config
+    r_obj2 = metrics.Recall.from_config(r_obj.get_config())
+    self.assertEqual(r_obj2.name, 'my_recall')
+    self.assertEqual(len(r_obj2.variables), 2)
+    self.assertEqual(r_obj2.thresholds, [0.4, 0.9])
+
   def test_value_is_idempotent(self):
     r_obj = metrics.Recall(thresholds=[0.3, 0.72])
     y_pred = random_ops.random_uniform(shape=(10, 3))
@@ -770,8 +862,15 @@ class SensitivityAtSpecificityTest(test.TestCase, parameterized.TestCase):
         0.4, num_thresholds=100, name='sensitivity_at_specificity_1')
     self.assertEqual(s_obj.name, 'sensitivity_at_specificity_1')
     self.assertLen(s_obj.variables, 4)
-    self.assertEqual(s_obj.value, 0.4)
-    self.assertLen(s_obj.thresholds, 100)
+    self.assertEqual(s_obj.specificity, 0.4)
+    self.assertEqual(s_obj.num_thresholds, 100)
+
+    # Check save and restore config
+    s_obj2 = metrics.SensitivityAtSpecificity.from_config(s_obj.get_config())
+    self.assertEqual(s_obj2.name, 'sensitivity_at_specificity_1')
+    self.assertLen(s_obj2.variables, 4)
+    self.assertEqual(s_obj2.specificity, 0.4)
+    self.assertEqual(s_obj2.num_thresholds, 100)
 
   def test_value_is_idempotent(self):
     s_obj = metrics.SensitivityAtSpecificity(0.7)
@@ -859,8 +958,15 @@ class SpecificityAtSensitivityTest(test.TestCase, parameterized.TestCase):
         0.4, num_thresholds=100, name='specificity_at_sensitivity_1')
     self.assertEqual(s_obj.name, 'specificity_at_sensitivity_1')
     self.assertLen(s_obj.variables, 4)
-    self.assertEqual(s_obj.value, 0.4)
-    self.assertLen(s_obj.thresholds, 100)
+    self.assertEqual(s_obj.sensitivity, 0.4)
+    self.assertEqual(s_obj.num_thresholds, 100)
+
+    # Check save and restore config
+    s_obj2 = metrics.SpecificityAtSensitivity.from_config(s_obj.get_config())
+    self.assertEqual(s_obj2.name, 'specificity_at_sensitivity_1')
+    self.assertLen(s_obj2.variables, 4)
+    self.assertEqual(s_obj2.sensitivity, 0.4)
+    self.assertEqual(s_obj2.num_thresholds, 100)
 
   def test_value_is_idempotent(self):
     s_obj = metrics.SpecificityAtSensitivity(0.7)
@@ -943,35 +1049,62 @@ class SpecificityAtSensitivityTest(test.TestCase, parameterized.TestCase):
 @test_util.run_all_in_graph_and_eager_modes
 class CosineProximityTest(test.TestCase):
 
+  def l2_norm(self, x, axis):
+    epsilon = 1e-12
+    square_sum = np.sum(np.square(x), axis=axis, keepdims=True)
+    x_inv_norm = 1 / np.sqrt(np.maximum(square_sum, epsilon))
+    return np.multiply(x, x_inv_norm)
+
+  def setup(self, axis=1):
+    self.np_y_true = np.asarray([[1, 9, 2], [-5, -2, 6]], dtype=np.float32)
+    self.np_y_pred = np.asarray([[4, 8, 12], [8, 1, 3]], dtype=np.float32)
+
+    y_true = self.l2_norm(self.np_y_true, axis)
+    y_pred = self.l2_norm(self.np_y_pred, axis)
+    self.expected_loss = -np.sum(np.multiply(y_true, y_pred), axis=(axis,))
+
+    self.y_true = constant_op.constant(self.np_y_true)
+    self.y_pred = constant_op.constant(self.np_y_pred)
+
   def test_config(self):
-    cosine_obj = metrics.CosineProximity(name='my_cos', dtype=dtypes.int32)
+    cosine_obj = metrics.CosineProximity(
+        axis=2, name='my_cos', dtype=dtypes.int32)
     self.assertEqual(cosine_obj.name, 'my_cos')
     self.assertEqual(cosine_obj._dtype, dtypes.int32)
 
+    # Check save and restore config
+    cosine_obj2 = metrics.CosineProximity.from_config(cosine_obj.get_config())
+    self.assertEqual(cosine_obj2.name, 'my_cos')
+    self.assertEqual(cosine_obj2._dtype, dtypes.int32)
+
   def test_unweighted(self):
+    self.setup()
     cosine_obj = metrics.CosineProximity()
     self.evaluate(variables.variables_initializer(cosine_obj.variables))
-
-    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
-                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
-    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
-                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
-
-    update_op = cosine_obj.update_state(y_true, y_pred)
-    self.evaluate(update_op)
-    result = cosine_obj.result()
-    self.assertAllClose(-0.60723, result, atol=1e-5)
+    loss = cosine_obj(self.y_true, self.y_pred)
+    expected_loss = np.mean(self.expected_loss)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
 
   def test_weighted(self):
+    self.setup()
     cosine_obj = metrics.CosineProximity()
     self.evaluate(variables.variables_initializer(cosine_obj.variables))
-    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
-                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
-    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
-                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
-    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
-    result = cosine_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAllClose(-0.59916, self.evaluate(result), atol=1e-5)
+    sample_weight = np.asarray([1.2, 3.4])
+    loss = cosine_obj(
+        self.y_true,
+        self.y_pred,
+        sample_weight=constant_op.constant(sample_weight))
+    expected_loss = np.sum(
+        self.expected_loss * sample_weight) / np.sum(sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_axis(self):
+    self.setup(axis=1)
+    cosine_obj = metrics.CosineProximity(axis=1)
+    self.evaluate(variables.variables_initializer(cosine_obj.variables))
+    loss = cosine_obj(self.y_true, self.y_pred)
+    expected_loss = np.mean(self.expected_loss)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
 
 
 @test_util.run_all_in_graph_and_eager_modes
@@ -982,6 +1115,11 @@ class MeanAbsoluteErrorTest(test.TestCase):
     self.assertEqual(mae_obj.name, 'my_mae')
     self.assertEqual(mae_obj._dtype, dtypes.int32)
 
+    # Check save and restore config
+    mae_obj2 = metrics.MeanAbsoluteError.from_config(mae_obj.get_config())
+    self.assertEqual(mae_obj2.name, 'my_mae')
+    self.assertEqual(mae_obj2._dtype, dtypes.int32)
+
   def test_unweighted(self):
     mae_obj = metrics.MeanAbsoluteError()
     self.evaluate(variables.variables_initializer(mae_obj.variables))
@@ -1016,6 +1154,12 @@ class MeanAbsolutePercentageErrorTest(test.TestCase):
     self.assertEqual(mape_obj.name, 'my_mape')
     self.assertEqual(mape_obj._dtype, dtypes.int32)
 
+    # Check save and restore config
+    mape_obj2 = metrics.MeanAbsolutePercentageError.from_config(
+        mape_obj.get_config())
+    self.assertEqual(mape_obj2.name, 'my_mape')
+    self.assertEqual(mape_obj2._dtype, dtypes.int32)
+
   def test_unweighted(self):
     mape_obj = metrics.MeanAbsolutePercentageError()
     self.evaluate(variables.variables_initializer(mape_obj.variables))
@@ -1049,6 +1193,11 @@ class MeanSquaredErrorTest(test.TestCase):
     self.assertEqual(mse_obj.name, 'my_mse')
     self.assertEqual(mse_obj._dtype, dtypes.int32)
 
+    # Check save and restore config
+    mse_obj2 = metrics.MeanSquaredError.from_config(mse_obj.get_config())
+    self.assertEqual(mse_obj2.name, 'my_mse')
+    self.assertEqual(mse_obj2._dtype, dtypes.int32)
+
   def test_unweighted(self):
     mse_obj = metrics.MeanSquaredError()
     self.evaluate(variables.variables_initializer(mse_obj.variables))
@@ -1083,6 +1232,12 @@ class MeanSquaredLogarithmicErrorTest(test.TestCase):
     self.assertEqual(msle_obj.name, 'my_msle')
     self.assertEqual(msle_obj._dtype, dtypes.int32)
 
+    # Check save and restore config
+    msle_obj2 = metrics.MeanSquaredLogarithmicError.from_config(
+        msle_obj.get_config())
+    self.assertEqual(msle_obj2.name, 'my_msle')
+    self.assertEqual(msle_obj2._dtype, dtypes.int32)
+
   def test_unweighted(self):
     msle_obj = metrics.MeanSquaredLogarithmicError()
     self.evaluate(variables.variables_initializer(msle_obj.variables))
@@ -1116,6 +1271,11 @@ class HingeTest(test.TestCase):
     self.assertEqual(hinge_obj.name, 'hinge')
     self.assertEqual(hinge_obj._dtype, dtypes.int32)
 
+    # Check save and restore config
+    hinge_obj2 = metrics.Hinge.from_config(hinge_obj.get_config())
+    self.assertEqual(hinge_obj2.name, 'hinge')
+    self.assertEqual(hinge_obj2._dtype, dtypes.int32)
+
   def test_unweighted(self):
     hinge_obj = metrics.Hinge()
     self.evaluate(variables.variables_initializer(hinge_obj.variables))
@@ -1149,6 +1309,11 @@ class SquaredHingeTest(test.TestCase):
     self.assertEqual(sq_hinge_obj.name, 'sq_hinge')
     self.assertEqual(sq_hinge_obj._dtype, dtypes.int32)
 
+    # Check save and restore config
+    sq_hinge_obj2 = metrics.SquaredHinge.from_config(sq_hinge_obj.get_config())
+    self.assertEqual(sq_hinge_obj2.name, 'sq_hinge')
+    self.assertEqual(sq_hinge_obj2._dtype, dtypes.int32)
+
   def test_unweighted(self):
     sq_hinge_obj = metrics.SquaredHinge()
     self.evaluate(variables.variables_initializer(sq_hinge_obj.variables))
@@ -1183,6 +1348,12 @@ class CategoricalHingeTest(test.TestCase):
     self.assertEqual(cat_hinge_obj.name, 'cat_hinge')
     self.assertEqual(cat_hinge_obj._dtype, dtypes.int32)
 
+    # Check save and restore config
+    cat_hinge_obj2 = metrics.CategoricalHinge.from_config(
+        cat_hinge_obj.get_config())
+    self.assertEqual(cat_hinge_obj2.name, 'cat_hinge')
+    self.assertEqual(cat_hinge_obj2._dtype, dtypes.int32)
+
   def test_unweighted(self):
     cat_hinge_obj = metrics.CategoricalHinge()
     self.evaluate(variables.variables_initializer(cat_hinge_obj.variables))
@@ -1208,6 +1379,308 @@ class CategoricalHingeTest(test.TestCase):
     self.assertAllClose(0.5, self.evaluate(result), atol=1e-5)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class RootMeanSquaredErrorTest(test.TestCase):
+
+  def test_config(self):
+    rmse_obj = metrics.RootMeanSquaredError(name='rmse', dtype=dtypes.int32)
+    self.assertEqual(rmse_obj.name, 'rmse')
+    self.assertEqual(rmse_obj._dtype, dtypes.int32)
+
+    rmse_obj2 = metrics.RootMeanSquaredError.from_config(rmse_obj.get_config())
+    self.assertEqual(rmse_obj2.name, 'rmse')
+    self.assertEqual(rmse_obj2._dtype, dtypes.int32)
+
+  def test_unweighted(self):
+    rmse_obj = metrics.RootMeanSquaredError()
+    self.evaluate(variables.variables_initializer(rmse_obj.variables))
+    y_true = constant_op.constant((2, 4, 6))
+    y_pred = constant_op.constant((1, 3, 2))
+
+    update_op = rmse_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = rmse_obj.result()
+    # error = [-1, -1, -4], square(error) = [1, 1, 16], mean = 18/3 = 6
+    self.assertAllClose(math.sqrt(6), result, atol=1e-3)
+
+  def test_weighted(self):
+    rmse_obj = metrics.RootMeanSquaredError()
+    self.evaluate(variables.variables_initializer(rmse_obj.variables))
+    y_true = constant_op.constant((2, 4, 6, 8))
+    y_pred = constant_op.constant((1, 3, 2, 3))
+    sample_weight = constant_op.constant((0, 1, 0, 1))
+    result = rmse_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(math.sqrt(13), self.evaluate(result), atol=1e-3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TopKCategoricalAccuracyTest(test.TestCase):
+
+  def test_config(self):
+    a_obj = metrics.TopKCategoricalAccuracy(name='topkca', dtype=dtypes.int32)
+    self.assertEqual(a_obj.name, 'topkca')
+    self.assertEqual(a_obj._dtype, dtypes.int32)
+
+    a_obj2 = metrics.TopKCategoricalAccuracy.from_config(a_obj.get_config())
+    self.assertEqual(a_obj2.name, 'topkca')
+    self.assertEqual(a_obj2._dtype, dtypes.int32)
+
+  def test_correctness(self):
+    a_obj = metrics.TopKCategoricalAccuracy()
+    self.evaluate(variables.variables_initializer(a_obj.variables))
+    y_true = constant_op.constant([[0, 0, 1], [0, 1, 0]])
+    y_pred = constant_op.constant([[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+
+    result = a_obj(y_true, y_pred)
+    self.assertEqual(1, self.evaluate(result))  # both the samples match
+
+    # With `k` < 5.
+    a_obj = metrics.TopKCategoricalAccuracy(k=1)
+    self.evaluate(variables.variables_initializer(a_obj.variables))
+    result = a_obj(y_true, y_pred)
+    self.assertEqual(0.5, self.evaluate(result))  # only sample #2 matches
+
+    # With `k` > 5.
+    y_true = constant_op.constant([[0, 0, 1, 0, 0, 0, 0],
+                                   [0, 1, 0, 0, 0, 0, 0]])
+    y_pred = constant_op.constant([[0.5, 0.9, 0.1, 0.7, 0.6, 0.5, 0.4],
+                                   [0.05, 0.95, 0, 0, 0, 0, 0]])
+    a_obj = metrics.TopKCategoricalAccuracy(k=6)
+    self.evaluate(variables.variables_initializer(a_obj.variables))
+    result = a_obj(y_true, y_pred)
+    self.assertEqual(0.5, self.evaluate(result))  # only 1 sample matches.
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class SparseTopKCategoricalAccuracyTest(test.TestCase):
+
+  def test_config(self):
+    a_obj = metrics.SparseTopKCategoricalAccuracy(
+        name='stopkca', dtype=dtypes.int32)
+    self.assertEqual(a_obj.name, 'stopkca')
+    self.assertEqual(a_obj._dtype, dtypes.int32)
+
+    a_obj2 = metrics.SparseTopKCategoricalAccuracy.from_config(
+        a_obj.get_config())
+    self.assertEqual(a_obj2.name, 'stopkca')
+    self.assertEqual(a_obj2._dtype, dtypes.int32)
+
+  def test_correctness(self):
+    a_obj = metrics.SparseTopKCategoricalAccuracy()
+    self.evaluate(variables.variables_initializer(a_obj.variables))
+    y_true = constant_op.constant([2, 1])
+    y_pred = constant_op.constant([[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+
+    result = a_obj(y_true, y_pred)
+    self.assertEqual(1, self.evaluate(result))  # both the samples match
+
+    # With `k` < 5.
+    a_obj = metrics.SparseTopKCategoricalAccuracy(k=1)
+    self.evaluate(variables.variables_initializer(a_obj.variables))
+    result = a_obj(y_true, y_pred)
+    self.assertEqual(0.5, self.evaluate(result))  # only sample #2 matches
+
+    # With `k` > 5.
+    y_pred = constant_op.constant([[0.5, 0.9, 0.1, 0.7, 0.6, 0.5, 0.4],
+                                   [0.05, 0.95, 0, 0, 0, 0, 0]])
+    a_obj = metrics.SparseTopKCategoricalAccuracy(k=6)
+    self.evaluate(variables.variables_initializer(a_obj.variables))
+    result = a_obj(y_true, y_pred)
+    self.assertEqual(0.5, self.evaluate(result))  # only 1 sample matches.
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class LogcoshTest(test.TestCase):
+
+  def setup(self):
+    y_pred = np.asarray([1, 9, 2, -5, -2, 6]).reshape((2, 3))
+    y_true = np.asarray([4, 8, 12, 8, 1, 3]).reshape((2, 3))
+
+    self.batch_size = 6
+    error = y_pred - y_true
+    self.expected_results = np.log((np.exp(error) + np.exp(-error)) / 2)
+
+    self.y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    self.y_true = constant_op.constant(y_true)
+
+  def test_config(self):
+    logcosh_obj = metrics.Logcosh(name='logcosh', dtype=dtypes.int32)
+    self.assertEqual(logcosh_obj.name, 'logcosh')
+    self.assertEqual(logcosh_obj._dtype, dtypes.int32)
+
+  def test_unweighted(self):
+    self.setup()
+    logcosh_obj = metrics.Logcosh()
+    self.evaluate(variables.variables_initializer(logcosh_obj.variables))
+
+    update_op = logcosh_obj.update_state(self.y_true, self.y_pred)
+    self.evaluate(update_op)
+    result = logcosh_obj.result()
+    expected_result = np.sum(self.expected_results) / self.batch_size
+    self.assertAllClose(result, expected_result, atol=1e-3)
+
+  def test_weighted(self):
+    self.setup()
+    logcosh_obj = metrics.Logcosh()
+    self.evaluate(variables.variables_initializer(logcosh_obj.variables))
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    result = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+
+    sample_weight = np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3))
+    expected_result = np.multiply(self.expected_results, sample_weight)
+    expected_result = np.sum(expected_result) / np.sum(sample_weight)
+    self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class PoissonTest(test.TestCase):
+
+  def setup(self):
+    y_pred = np.asarray([1, 9, 2, 5, 2, 6]).reshape((2, 3))
+    y_true = np.asarray([4, 8, 12, 8, 1, 3]).reshape((2, 3))
+
+    self.batch_size = 6
+    self.expected_results = y_pred - np.multiply(y_true, np.log(y_pred))
+
+    self.y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    self.y_true = constant_op.constant(y_true)
+
+  def test_config(self):
+    poisson_obj = metrics.Poisson(name='poisson', dtype=dtypes.int32)
+    self.assertEqual(poisson_obj.name, 'poisson')
+    self.assertEqual(poisson_obj._dtype, dtypes.int32)
+
+    poisson_obj2 = metrics.Poisson.from_config(poisson_obj.get_config())
+    self.assertEqual(poisson_obj2.name, 'poisson')
+    self.assertEqual(poisson_obj2._dtype, dtypes.int32)
+
+  def test_unweighted(self):
+    self.setup()
+    poisson_obj = metrics.Poisson()
+    self.evaluate(variables.variables_initializer(poisson_obj.variables))
+
+    update_op = poisson_obj.update_state(self.y_true, self.y_pred)
+    self.evaluate(update_op)
+    result = poisson_obj.result()
+    expected_result = np.sum(self.expected_results) / self.batch_size
+    self.assertAllClose(result, expected_result, atol=1e-3)
+
+  def test_weighted(self):
+    self.setup()
+    poisson_obj = metrics.Poisson()
+    self.evaluate(variables.variables_initializer(poisson_obj.variables))
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+
+    result = poisson_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    sample_weight = np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3))
+    expected_result = np.multiply(self.expected_results, sample_weight)
+    expected_result = np.sum(expected_result) / np.sum(sample_weight)
+    self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class KullbackLeiblerDivergenceTest(test.TestCase):
+
+  def setup(self):
+    y_pred = np.asarray([.4, .9, .12, .36, .3, .4]).reshape((2, 3))
+    y_true = np.asarray([.5, .8, .12, .7, .43, .8]).reshape((2, 3))
+
+    self.batch_size = 2
+    self.expected_results = np.multiply(y_true, np.log(y_true / y_pred))
+
+    self.y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    self.y_true = constant_op.constant(y_true)
+
+  def test_config(self):
+    k_obj = metrics.KullbackLeiblerDivergence(name='kld', dtype=dtypes.int32)
+    self.assertEqual(k_obj.name, 'kld')
+    self.assertEqual(k_obj._dtype, dtypes.int32)
+
+    k_obj2 = metrics.KullbackLeiblerDivergence.from_config(k_obj.get_config())
+    self.assertEqual(k_obj2.name, 'kld')
+    self.assertEqual(k_obj2._dtype, dtypes.int32)
+
+  def test_unweighted(self):
+    self.setup()
+    k_obj = metrics.KullbackLeiblerDivergence()
+    self.evaluate(variables.variables_initializer(k_obj.variables))
+
+    update_op = k_obj.update_state(self.y_true, self.y_pred)
+    self.evaluate(update_op)
+    result = k_obj.result()
+    expected_result = np.sum(self.expected_results) / self.batch_size
+    self.assertAllClose(result, expected_result, atol=1e-3)
+
+  def test_weighted(self):
+    self.setup()
+    k_obj = metrics.KullbackLeiblerDivergence()
+    self.evaluate(variables.variables_initializer(k_obj.variables))
+
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    result = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+
+    sample_weight = np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3))
+    expected_result = np.multiply(self.expected_results, sample_weight)
+    expected_result = np.sum(expected_result) / (1.2 + 3.4)
+    self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MeanRelativeErrorTest(test.TestCase):
+
+  def test_config(self):
+    normalizer = constant_op.constant([1, 3], dtype=dtypes.float32)
+    mre_obj = metrics.MeanRelativeError(normalizer=normalizer, name='mre')
+    self.assertEqual(mre_obj.name, 'mre')
+    self.assertArrayNear(self.evaluate(mre_obj.normalizer), [1, 3], 1e-1)
+
+    mre_obj2 = metrics.MeanRelativeError.from_config(mre_obj.get_config())
+    self.assertEqual(mre_obj2.name, 'mre')
+    self.assertArrayNear(self.evaluate(mre_obj2.normalizer), [1, 3], 1e-1)
+
+  def test_unweighted(self):
+    np_y_pred = np.asarray([2, 4, 6, 8], dtype=np.float32)
+    np_y_true = np.asarray([1, 3, 2, 3], dtype=np.float32)
+    expected_error = np.mean(
+        np.divide(np.absolute(np_y_pred - np_y_true), np_y_true))
+
+    y_pred = constant_op.constant(np_y_pred, shape=(1, 4), dtype=dtypes.float32)
+    y_true = constant_op.constant(np_y_true, shape=(1, 4))
+
+    mre_obj = metrics.MeanRelativeError(normalizer=y_true)
+    self.evaluate(variables.variables_initializer(mre_obj.variables))
+
+    result = mre_obj(y_true, y_pred)
+    self.assertAllClose(self.evaluate(result), expected_error, atol=1e-3)
+
+  def test_weighted(self):
+    np_y_pred = np.asarray([2, 4, 6, 8], dtype=np.float32)
+    np_y_true = np.asarray([1, 3, 2, 3], dtype=np.float32)
+    sample_weight = np.asarray([0.2, 0.3, 0.5, 0], dtype=np.float32)
+    rel_errors = np.divide(np.absolute(np_y_pred - np_y_true), np_y_true)
+    expected_error = np.sum(rel_errors * sample_weight)
+
+    y_pred = constant_op.constant(np_y_pred, dtype=dtypes.float32)
+    y_true = constant_op.constant(np_y_true)
+
+    mre_obj = metrics.MeanRelativeError(normalizer=y_true)
+    self.evaluate(variables.variables_initializer(mre_obj.variables))
+
+    result = mre_obj(
+        y_true, y_pred, sample_weight=constant_op.constant(sample_weight))
+    self.assertAllClose(self.evaluate(result), expected_error, atol=1e-3)
+
+  def test_zero_normalizer(self):
+    y_pred = constant_op.constant([2, 4], dtype=dtypes.float32)
+    y_true = constant_op.constant([1, 3])
+
+    mre_obj = metrics.MeanRelativeError(normalizer=array_ops.zeros_like(y_true))
+    self.evaluate(variables.variables_initializer(mre_obj.variables))
+
+    result = mre_obj(y_true, y_pred)
+    self.assertEqual(self.evaluate(result), 0)
+
+
 def _get_model(compile_metrics):
   model_layers = [
       layers.Dense(3, activation='relu', kernel_initializer='ones'),
diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py
index f36c89041c2553a446ee54d3e74e60e00333704a..851de6e71fa63e15a0a23a5ede6c7e63c4eb5fc5 100644
--- a/tensorflow/python/keras/models.py
+++ b/tensorflow/python/keras/models.py
@@ -281,8 +281,6 @@ def clone_model(model, input_tensors=None):
 
 
 # "Clone" a subclassed model by reseting all of the attributes.
-
-
 def _in_place_subclassed_model_reset(model):
   """Substitute for model cloning that works for subclassed models.
 
@@ -382,11 +380,30 @@ def _in_place_subclassed_model_reset(model):
       for name in attributes_to_cache:
         attributes_cache[name] = getattr(model, name)
   model._original_attributes_cache = attributes_cache
-  # Reset built state
+  _reset_build_compile_trackers(model)
+  model._setattr_tracking = setattr_tracking
+
+
+def _reset_build_compile_trackers(model):
+  """Reset state trackers for model.
+
+  Note that we do not actually zero out attributes such as optimizer,
+  but instead rely on the expectation that all of the attrs will be
+  over-written on calling build/compile/etc. This is somewhat fragile,
+  insofar as we check elsewhere for the presence of these attributes as
+  evidence of having been built/compiled/etc. Pending a better way to do this,
+  we reset key attributes here to allow building and compiling.
+
+  Args:
+    model: the model that is being reset
+  """
+  # Reset build state
   model.built = False
   model.inputs = None
   model.outputs = None
-  model._setattr_tracking = setattr_tracking
+  # Reset compile state
+  model._is_compiled = False  # pylint:disable=protected-access
+  model.optimizer = None
 
 
 def in_place_subclassed_model_state_restoration(model):
@@ -418,9 +435,7 @@ def in_place_subclassed_model_state_restoration(model):
     model._setattr_tracking = setattr_tracking
   else:
     # Restore to the state of a never-called model.
-    model.built = False
-    model.inputs = None
-    model.outputs = None
+    _reset_build_compile_trackers(model)
 
 
 def clone_and_build_model(
@@ -462,7 +477,10 @@ def clone_and_build_model(
       - cloning a subclassed model with `in_place_reset` set to False.
       - compiling the clone when the original model has not been compiled.
   """
-  if compile_clone and not model.optimizer:
+  # Grab optimizer now, as we reset-in-place for subclassed models, but
+  # want to maintain access to the original optimizer.
+  orig_optimizer = model.optimizer
+  if compile_clone and not orig_optimizer:
     raise ValueError(
         'Error when cloning model: compile_clone was set to True, but the '
         'original model has not been compiled.')
@@ -498,14 +516,14 @@ def clone_and_build_model(
         input_tensors = input_tensors[0]
       clone._set_inputs(input_tensors)
 
-  if compile_clone and model.optimizer:
-    if isinstance(model.optimizer, optimizers.TFOptimizer):
+  if compile_clone:
+    if isinstance(orig_optimizer, optimizers.TFOptimizer):
       optimizer = optimizers.TFOptimizer(
-          model.optimizer.optimizer, optimizer_iterations)
+          orig_optimizer.optimizer, optimizer_iterations)
       K.track_tf_optimizer(optimizer)
     else:
-      optimizer_config = model.optimizer.get_config()
-      optimizer = model.optimizer.__class__.from_config(optimizer_config)
+      optimizer_config = orig_optimizer.get_config()
+      optimizer = orig_optimizer.__class__.from_config(optimizer_config)
       if optimizer_iterations is not None:
         optimizer.iterations = optimizer_iterations
 
diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py
index 0a5f9a7bea03dba27e9c9cef1609b5c469f7147d..3eab10f624a4c36ba423e817b373fccf35ceeda6 100644
--- a/tensorflow/python/keras/models_test.py
+++ b/tensorflow/python/keras/models_test.py
@@ -19,18 +19,21 @@ from __future__ import division
 from __future__ import print_function
 
 import copy
+import functools
 import os
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
 from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics
 from tensorflow.python.keras import models
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -52,158 +55,181 @@ class TestModel(keras.Model):
     return self.layer1(x)
 
 
-def sequential_model(add_input_layer, include_input_shape=True):
-  model = keras.models.Sequential()
+def _get_layers(input_shape=(4,), add_input_layer=False):
   if add_input_layer:
-    model.add(keras.layers.InputLayer(input_shape=(4,)))
-    model.add(keras.layers.Dense(4))
-  elif include_input_shape:
-    model.add(keras.layers.Dense(4, input_shape=(4,)))
+    model_layers = [keras.layers.InputLayer(input_shape=input_shape),
+                    keras.layers.Dense(4)]
+  elif input_shape:
+    model_layers = [keras.layers.Dense(4, input_shape=input_shape)]
   else:
-    model.add(keras.layers.Dense(4))
-  model.add(keras.layers.BatchNormalization())
-  model.add(keras.layers.Dropout(0.5))
-  model.add(keras.layers.Dense(4))
-  return model
-
-
-class TestModelCloning(test.TestCase):
-
-  @test_util.run_v1_only('b/120545219')
-  def test_clone_sequential_model(self):
-    with self.cached_session():
-      val_a = np.random.random((10, 4))
-      val_out = np.random.random((10, 4))
-
-      model = sequential_model(False)
-
-    # Everything should work in a new session.
-    keras.backend.clear_session()
-
-    with self.cached_session():
-      # With placeholder creation
-      new_model = keras.models.clone_model(model)
+    model_layers = [keras.layers.Dense(4)]
+
+  model_layers += [
+      keras.layers.BatchNormalization(),
+      keras.layers.Dropout(0.5),
+      keras.layers.Dense(4)]
+
+  return model_layers
+
+
+def _get_model(input_shape=(4,)):
+  model_layers = _get_layers(input_shape=None, add_input_layer=False)
+  return testing_utils.get_model_from_layers(
+      model_layers, input_shape=input_shape)
+
+
+class TestModelCloning(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_all_keras_modes
+  @parameterized.named_parameters([
+      {'testcase_name': 'has_input_layer',
+       'input_shape': (4,),
+       'add_input_layer': True,
+       'share_weights': False},
+      {'testcase_name': 'no_input_layer',
+       'input_shape': None,
+       'add_input_layer': False,
+       'share_weights': False},
+      {'testcase_name': 'has_input_layer_share_weights',
+       'input_shape': (4,),
+       'add_input_layer': True,
+       'share_weights': True},
+      {'testcase_name': 'no_input_layer_share_weights',
+       'input_shape': None,
+       'add_input_layer': False,
+       'share_weights': True},
+  ])
+  def test_clone_sequential_model(
+      self, input_shape, add_input_layer, share_weights):
+
+    if share_weights:
+      clone_fn = functools.partial(
+          keras.models._clone_sequential_model, share_weights=True)
+    else:
+      clone_fn = keras.models.clone_model
+
+    val_a = np.random.random((10, 4))
+    model = models.Sequential(_get_layers(input_shape, add_input_layer))
+    # Sanity check
+    self.assertEqual(
+        isinstance(model._layers[0], keras.layers.InputLayer),
+        add_input_layer)
+    self.assertEqual(model._is_graph_network, add_input_layer)
+
+    # With placeholder creation -- clone model should have an InputLayer
+    # if the original model has one.
+    new_model = clone_fn(model)
+    self.assertEqual(
+        isinstance(new_model._layers[0], keras.layers.InputLayer),
+        add_input_layer)
+    self.assertEqual(new_model._is_graph_network, model._is_graph_network)
+    if input_shape:
       # update ops from batch norm needs to be included
       self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch(val_a, val_out)
-
-      # On top of new tensor
-      input_a = keras.Input(shape=(4,))
-      new_model = keras.models.clone_model(model, input_tensors=input_a)
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch(val_a, val_out)
 
-      # On top of new, non-Keras tensor
+    # On top of new tensor  -- clone model should always have an InputLayer.
+    input_a = keras.Input(shape=(4,))
+    new_model = clone_fn(model, input_tensors=input_a)
+    self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer)
+    self.assertTrue(new_model._is_graph_network)
+
+    # On top of new, non-Keras tensor  -- clone model should always have an
+    # InputLayer.
+    if not context.executing_eagerly():
+      # TODO(b/121277734):Skip Eager contexts, as Input() layers raise an error
+      # saying they should not be used with EagerTensors
       input_a = keras.backend.variable(val_a)
-      new_model = keras.models.clone_model(model, input_tensors=input_a)
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch(None, val_out)
-
-  @test_util.run_v1_only('b/120545219')
-  def test_clone_sequential_model_input_layer(self):
-
-    def test_input_layer(include_inputs):
-      with self.cached_session():
-        val_a = np.random.random((10, 4))
-        model = sequential_model(include_inputs, include_inputs)
-        # Sanity check
-        self.assertEqual(
-            isinstance(model._layers[0], keras.layers.InputLayer),
-            include_inputs)
-        self.assertEqual(model._is_graph_network, include_inputs)
-
-      keras.backend.clear_session()
-      with self.cached_session():
-        # With placeholder creation -- clone model should have an InputLayer
-        # if the original model has one.
-        new_model = keras.models.clone_model(model)
-        self.assertEqual(
-            isinstance(new_model._layers[0], keras.layers.InputLayer),
-            include_inputs)
-        self.assertEqual(new_model._is_graph_network, model._is_graph_network)
-
-        # On top of new tensor  -- clone model should always have an InputLayer.
-        input_a = keras.Input(shape=(4,))
-        new_model = keras.models.clone_model(model, input_tensors=input_a)
-        self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer)
-        self.assertTrue(new_model._is_graph_network)
-
-        # On top of new, non-Keras tensor  -- clone model should always have an
-        # InputLayer.
-        input_a = keras.backend.variable(val_a)
-        new_model = keras.models.clone_model(model, input_tensors=input_a)
-        self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer)
-        self.assertTrue(new_model._is_graph_network)
-
-    test_input_layer(True)
-    test_input_layer(False)
-
-  @test_util.run_v1_only('b/120545219')
-  def test_clone_functional_model(self):
-    with self.cached_session():
-      val_a = np.random.random((10, 4))
-      val_b = np.random.random((10, 4))
-      val_out = np.random.random((10, 4))
-
-      input_a = keras.Input(shape=(4,))
-      input_b = keras.Input(shape=(4,))
-      dense_1 = keras.layers.Dense(4,)
-      dense_2 = keras.layers.Dense(4,)
-
-      x_a = dense_1(input_a)
-      x_a = keras.layers.Dropout(0.5)(x_a)
-      x_a = keras.layers.BatchNormalization()(x_a)
-      x_b = dense_1(input_b)
-      x_a = dense_2(x_a)
-      outputs = keras.layers.add([x_a, x_b])
-      model = keras.models.Model([input_a, input_b], outputs)
-
-    # Everything should work in a new session.
-    keras.backend.clear_session()
-
-    with self.cached_session():
-      # With placeholder creation
-      new_model = keras.models.clone_model(model)
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch([val_a, val_b], val_out)
-
-      # On top of new tensors
-      input_a = keras.Input(shape=(4,), name='a')
-      input_b = keras.Input(shape=(4,), name='b')
-      new_model = keras.models.clone_model(
-          model, input_tensors=[input_a, input_b])
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch([val_a, val_b], val_out)
-
-      # On top of new, non-Keras tensors
+      new_model = clone_fn(model, input_tensors=input_a)
+      self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer)
+      self.assertTrue(new_model._is_graph_network)
+
+  @keras_parameterized.run_all_keras_modes
+  @parameterized.named_parameters([
+      {'testcase_name': 'clone_weights', 'share_weights': False},
+      {'testcase_name': 'share_weights', 'share_weights': True},
+  ])
+  def test_clone_functional_model(self, share_weights):
+    if share_weights:
+      clone_fn = functools.partial(
+          keras.models._clone_functional_model, share_weights=True)
+    else:
+      clone_fn = keras.models.clone_model
+
+    val_a = np.random.random((10, 4))
+    val_b = np.random.random((10, 4))
+    val_out = np.random.random((10, 4))
+
+    input_a = keras.Input(shape=(4,))
+    input_b = keras.Input(shape=(4,))
+    dense_1 = keras.layers.Dense(4,)
+    dense_2 = keras.layers.Dense(4,)
+
+    x_a = dense_1(input_a)
+    x_a = keras.layers.Dropout(0.5)(x_a)
+    x_a = keras.layers.BatchNormalization()(x_a)
+    x_b = dense_1(input_b)
+    x_a = dense_2(x_a)
+    outputs = keras.layers.add([x_a, x_b])
+    model = keras.models.Model([input_a, input_b], outputs)
+
+    # With placeholder creation
+    new_model = clone_fn(model)
+    self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
+    new_model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+    new_model.train_on_batch([val_a, val_b], val_out)
+
+    # On top of new tensors
+    input_a = keras.Input(shape=(4,), name='a')
+    input_b = keras.Input(shape=(4,), name='b')
+    new_model = keras.models.clone_model(
+        model, input_tensors=[input_a, input_b])
+    self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
+    new_model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+    new_model.train_on_batch([val_a, val_b], val_out)
+
+    # On top of new, non-Keras tensors
+    if not context.executing_eagerly():
+      # TODO(b/121277734):Skip Eager contexts, as Input() layers raise an error
+      # saying they should not be used with EagerTensors
       input_a = keras.backend.variable(val_a)
       input_b = keras.backend.variable(val_b)
-      new_model = keras.models.clone_model(
-          model, input_tensors=[input_a, input_b])
+      new_model = clone_fn(model, input_tensors=[input_a, input_b])
       self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
+      new_model.compile(
+          testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+          run_eagerly=testing_utils.should_run_eagerly())
       new_model.train_on_batch(None, val_out)
 
-  @test_util.run_in_graph_and_eager_modes
-  def test_clone_functional_model_with_masking(self):
-    with self.cached_session():
-      x = np.array([[[1], [1]], [[0], [0]]])
-      inputs = keras.Input((2, 1))
-      outputs = keras.layers.Masking(mask_value=0)(inputs)
-      outputs = keras.layers.TimeDistributed(
-          keras.layers.Dense(1, kernel_initializer='one'))(outputs)
-      model = keras.Model(inputs, outputs)
-
-      model = keras.models.clone_model(model)
-      model.compile(loss='mse', optimizer=adam.AdamOptimizer(0.01))
-      y = np.array([[[1], [1]], [[1], [1]]])
-      loss = model.train_on_batch(x, y)
-      self.assertEqual(float(loss), 0.)
+  @keras_parameterized.run_all_keras_modes
+  @parameterized.named_parameters([
+      {'testcase_name': 'clone_weights', 'share_weights': False},
+      {'testcase_name': 'share_weights', 'share_weights': True},
+  ])
+  def test_clone_functional_with_masking(self, share_weights):
+    if share_weights:
+      clone_fn = functools.partial(
+          keras.models._clone_functional_model, share_weights=True)
+    else:
+      clone_fn = keras.models.clone_model
+
+    x = np.array([[[1.], [1.]], [[0.], [0.]]])
+    inputs = keras.Input((2, 1))
+    outputs = keras.layers.Masking(mask_value=0)(inputs)
+    outputs = keras.layers.TimeDistributed(
+        keras.layers.Dense(1, kernel_initializer='one'))(outputs)
+    model = keras.Model(inputs, outputs)
+
+    model = clone_fn(model)
+    model.compile(
+        loss='mse', optimizer=testing_utils.get_v2_optimizer('adam'),
+        run_eagerly=testing_utils.should_run_eagerly())
+    y = np.array([[[1], [1]], [[1], [1]]])
+    loss = model.train_on_batch(x, y)
+    self.assertEqual(float(loss), 0.)
 
   def test_model_cloning_invalid_use_cases(self):
     seq_model = keras.models.Sequential()
@@ -249,168 +275,23 @@ class TestModelCloning(test.TestCase):
       self.assertFalse(has_placeholder)
 
 
-class TestModelCloningLayerPreserveWeights(test.TestCase):
-
-  @test_util.run_deprecated_v1
-  def test_clone_sequential_model(self):
-    with self.cached_session():
-      val_a = np.random.random((10, 4))
-      val_out = np.random.random((10, 4))
-
-      model = sequential_model(False)
-
-    # Everything should work in a new session.
-    keras.backend.clear_session()
-
-    with self.cached_session():
-      # With placeholder creation
-      new_model = keras.models._clone_sequential_model(
-          model, share_weights=True)
-      # update ops from batch norm needs to be included
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch(val_a, val_out)
-
-      # On top of new tensor
-      input_a = keras.Input(shape=(4,))
-      new_model = keras.models._clone_sequential_model(
-          model, input_tensors=input_a, share_weights=True)
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch(val_a, val_out)
-
-      # On top of new, non-Keras tensor
-      input_a = keras.backend.variable(val_a)
-      new_model = keras.models._clone_sequential_model(
-          model, input_tensors=input_a, share_weights=True)
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch(None, val_out)
-
-  @test_util.run_deprecated_v1
-  def test_clone_sequential_model_input_layer(self):
-
-    @test_util.run_deprecated_v1
-    def test_input_layer(include_inputs):
-      with self.cached_session():
-        val_a = np.random.random((10, 4))
-        model = sequential_model(include_inputs, include_inputs)
-        # Sanity check
-        self.assertEqual(
-            isinstance(model._layers[0], keras.layers.InputLayer),
-            include_inputs)
-        self.assertEqual(model._is_graph_network, include_inputs)
-
-      keras.backend.clear_session()
-      with self.cached_session():
-        # With placeholder creation -- clone model should have an InputLayer
-        # if the original model has one.
-        new_model = keras.models._clone_sequential_model(
-            model, share_weights=True)
-        self.assertEqual(
-            isinstance(new_model._layers[0], keras.layers.InputLayer),
-            include_inputs)
-        self.assertEqual(new_model._is_graph_network, model._is_graph_network)
-
-        # On top of new tensor  -- clone model should always have an InputLayer.
-        input_a = keras.Input(shape=(4,))
-        new_model = keras.models._clone_sequential_model(
-            model, input_tensors=input_a, share_weights=True)
-        self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer)
-        self.assertTrue(new_model._is_graph_network)
-
-        # On top of new, non-Keras tensor  -- clone model should always have an
-        # InputLayer.
-        input_a = keras.backend.variable(val_a)
-        new_model = keras.models._clone_sequential_model(
-            model, input_tensors=input_a, share_weights=True)
-        self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer)
-        self.assertTrue(new_model._is_graph_network)
-
-    test_input_layer(True)
-    test_input_layer(False)
-
-  @test_util.run_deprecated_v1
-  def test_clone_functional_model(self):
-    with self.cached_session():
-      val_a = np.random.random((10, 4))
-      val_b = np.random.random((10, 4))
-      val_out = np.random.random((10, 4))
-
-      input_a = keras.Input(shape=(4,))
-      input_b = keras.Input(shape=(4,))
-      dense_1 = keras.layers.Dense(4,)
-      dense_2 = keras.layers.Dense(4,)
-
-      x_a = dense_1(input_a)
-      x_a = keras.layers.Dropout(0.5)(x_a)
-      x_a = keras.layers.BatchNormalization()(x_a)
-      x_b = dense_1(input_b)
-      x_a = dense_2(x_a)
-      outputs = keras.layers.add([x_a, x_b])
-      model = keras.models.Model([input_a, input_b], outputs)
-
-    # Everything should work in a new session.
-    keras.backend.clear_session()
-
-    with self.cached_session():
-      # With placeholder creation
-      new_model = keras.models._clone_functional_model(
-          model, share_weights=True)
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch([val_a, val_b], val_out)
-
-      # On top of new tensors
-      input_a = keras.Input(shape=(4,), name='a')
-      input_b = keras.Input(shape=(4,), name='b')
-      new_model = keras.models._clone_functional_model(
-          model, input_tensors=[input_a, input_b], share_weights=True)
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch([val_a, val_b], val_out)
-
-      # On top of new, non-Keras tensors
-      input_a = keras.backend.variable(val_a)
-      input_b = keras.backend.variable(val_b)
-      new_model = keras.models._clone_functional_model(
-          model, input_tensors=[input_a, input_b], share_weights=True)
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch(None, val_out)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_clone_functional_model_with_masking(self):
-    with self.cached_session():
-      x = np.array([[[1], [1]], [[0], [0]]])
-      inputs = keras.Input((2, 1))
-      outputs = keras.layers.Masking(mask_value=0)(inputs)
-      outputs = keras.layers.TimeDistributed(
-          keras.layers.Dense(1, kernel_initializer='one'))(outputs)
-      model = keras.Model(inputs, outputs)
-
-      model = keras.models._clone_functional_model(
-          model, share_weights=True)
-      model.compile(loss='mse', optimizer=adam.AdamOptimizer(0.01))
-      y = np.array([[[1], [1]], [[1], [1]]])
-      loss = model.train_on_batch(x, y)
-      self.assertEqual(float(loss), 0.)
-
-
 def _has_placeholder(graph):
   ops_types = [op.type for op in graph.get_operations()]
   return any('Placeholder' in s for s in ops_types)
 
 
-class CheckpointingTests(test.TestCase):
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class CheckpointingTests(keras_parameterized.TestCase):
 
-  @test_util.run_in_graph_and_eager_modes
   def test_optimizer_dependency(self):
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(1, input_shape=(4,)))
-    opt = adam.AdamOptimizer(0.01)
-    model.compile(optimizer=opt, loss='mse')
-    model.fit(x=np.array([[1., 2., 3., 4.]]), y=[1.], epochs=2)
+    model = _get_model()
+    opt = adam.AdamOptimizer(.01)
+    model.compile(
+        optimizer=opt, loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    model.fit(x=np.array([[1., 2., 3., 4.]]), y=np.array([1.]), epochs=2)
     save_prefix = os.path.join(self.get_temp_dir(), 'ckpt')
     beta1_power, _ = opt._get_beta_accumulators()
     self.evaluate(beta1_power.assign(12.))
@@ -420,7 +301,8 @@ class CheckpointingTests(test.TestCase):
     self.assertEqual(12., self.evaluate(beta1_power))
 
 
-class TestModelBackend(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class TestModelBackend(keras_parameterized.TestCase):
 
   def test_model_backend_float64_use_cases(self):
     # Test case for GitHub issue 19318
@@ -430,7 +312,9 @@ class TestModelBackend(test.TestCase):
     x = keras.Input((5,))
     y = keras.layers.Dense(1)(x)
     model = keras.models.Model(x, y)
-    model.compile('rmsprop', 'mse')
+    model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+        run_eagerly=testing_utils.should_run_eagerly())
 
     keras.backend.set_floatx(floatx)
 
@@ -465,48 +349,46 @@ class TestModelDeepCopy(test.TestCase):
                       model_copy.get_weights()[0]))
 
 
-@test_util.run_v1_only('b/120545219')
-class TestCloneAndBuildModel(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class TestCloneAndBuildModel(keras_parameterized.TestCase):
 
+  @keras_parameterized.run_with_all_model_types
   def test_clone_and_build_non_compiled_model(self):
-    with self.cached_session():
-      inp = np.random.random((10, 4))
-      out = np.random.random((10, 4))
+    inp = np.random.random((10, 4))
+    out = np.random.random((10, 4))
 
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(4, input_shape=(4,)))
-      model.add(keras.layers.BatchNormalization())
-      model.add(keras.layers.Dropout(0.5))
-      model.add(keras.layers.Dense(4))
-
-    # Everything should work in a new session.
-    keras.backend.clear_session()
-
-    with self.cached_session():
-      with self.assertRaisesRegexp(ValueError, 'has not been compiled'):
-        models.clone_and_build_model(model, compile_clone=True)
-
-      # With placeholder creation
-      new_model = models.clone_and_build_model(model, compile_clone=False)
-      with self.assertRaisesRegexp(RuntimeError, 'must compile'):
-        new_model.evaluate(inp, out)
-      with self.assertRaisesRegexp(RuntimeError, 'must compile'):
-        new_model.train_on_batch(inp, out)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch(inp, out)
+    model = _get_model()
+
+    with self.assertRaisesRegexp(ValueError, 'has not been compiled'):
+      models.clone_and_build_model(model, compile_clone=True)
 
-      # Create new tensors for inputs and targets
-      input_a = keras.Input(shape=(4,))
-      target_a = keras.Input(shape=(4,))
-      new_model = models.clone_and_build_model(model, input_tensors=input_a,
-                                               target_tensors=[target_a],
-                                               compile_clone=False)
-      with self.assertRaisesRegexp(RuntimeError, 'must compile'):
-        new_model.evaluate(inp, out)
-      with self.assertRaisesRegexp(RuntimeError, 'must compile'):
-        new_model.train_on_batch(inp, out)
-      new_model.compile('rmsprop', 'mse')
+    is_subclassed = (testing_utils.get_model_type() == 'subclass')
+    # With placeholder creation
+    new_model = models.clone_and_build_model(
+        model, compile_clone=False, in_place_reset=is_subclassed)
+    with self.assertRaisesRegexp(RuntimeError, 'must compile'):
+      new_model.evaluate(inp, out)
+    with self.assertRaisesRegexp(RuntimeError, 'must compile'):
+      new_model.train_on_batch(inp, out)
+    new_model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+    new_model.train_on_batch(inp, out)
+
+    # Create new tensors for inputs and targets
+    input_a = keras.Input(shape=(4,))
+    target_a = keras.Input(shape=(4,))
+    new_model = models.clone_and_build_model(
+        model, input_tensors=input_a, target_tensors=[target_a],
+        compile_clone=False, in_place_reset=is_subclassed)
+    with self.assertRaisesRegexp(RuntimeError, 'must compile'):
+      new_model.evaluate(inp, out)
+    with self.assertRaisesRegexp(RuntimeError, 'must compile'):
       new_model.train_on_batch(inp, out)
+    new_model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+    new_model.train_on_batch(inp, out)
 
   def _assert_same_compile_params(self, model):
     """Assert that two models have the same compile parameters."""
@@ -519,134 +401,88 @@ class TestCloneAndBuildModel(test.TestCase):
     self.assertEqual(['acc', metrics.categorical_accuracy],
                      model._compile_metrics)
 
-  def _clone_and_build_test_helper(self, model, is_subclassed=False):
+  def _clone_and_build_test_helper(self, model, model_type):
     inp = np.random.random((10, 4))
     out = np.random.random((10, 4))
 
-    # Everything should work in a new session.
-    keras.backend.clear_session()
-
-    with self.cached_session():
-      # With placeholder creation
-      new_model = models.clone_and_build_model(
-          model, compile_clone=True, in_place_reset=is_subclassed)
+    is_subclassed = (model_type == 'subclass')
+
+    # With placeholder creation
+    new_model = models.clone_and_build_model(
+        model, compile_clone=True, in_place_reset=is_subclassed)
+
+    self._assert_same_compile_params(new_model)
+    new_model.train_on_batch(inp, out)
+    new_model.evaluate(inp, out)
+
+    # Create new tensors for inputs and targets
+    input_a = keras.Input(shape=(4,), name='a')
+    new_model = models.clone_and_build_model(
+        model, input_tensors=input_a, compile_clone=True,
+        in_place_reset=is_subclassed)
+    self._assert_same_compile_params(new_model)
+    new_model.train_on_batch(inp, out)
+    new_model.evaluate(inp, out)
+
+    target_a = keras.Input(shape=(4,), name='b')
+    new_model = models.clone_and_build_model(
+        model, input_tensors=input_a, target_tensors=[target_a],
+        compile_clone=True, in_place_reset=is_subclassed)
+    self._assert_same_compile_params(new_model)
+    new_model.train_on_batch(inp, out)
+    new_model.evaluate(inp, out)
+
+  @keras_parameterized.run_with_all_model_types
+  def test_clone_and_build_compiled(self):
+    model = _get_model()
+    model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+        metrics=['acc', metrics.categorical_accuracy],
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    self._clone_and_build_test_helper(model, testing_utils.get_model_type())
+
+  def test_clone_and_build_sequential_without_inputs_defined(self):
+    model = models.Sequential(_get_layers(input_shape=None))
+    model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'),
+        'mse', metrics=['acc', metrics.categorical_accuracy],
+        run_eagerly=testing_utils.should_run_eagerly())
+    self._clone_and_build_test_helper(model, 'sequential')
 
-      self._assert_same_compile_params(new_model)
-      new_model.train_on_batch(inp, out)
-      new_model.evaluate(inp, out)
-
-      # Create new tensors for inputs and targets
-      input_a = keras.Input(shape=(4,), name='a')
-      new_model = models.clone_and_build_model(
-          model, input_tensors=input_a, compile_clone=True,
-          in_place_reset=is_subclassed)
-      self._assert_same_compile_params(new_model)
-      new_model.train_on_batch(inp, out)
-      new_model.evaluate(inp, out)
-
-      target_a = keras.Input(shape=(4,), name='b')
-      new_model = models.clone_and_build_model(
-          model, input_tensors=input_a, target_tensors=[target_a],
-          compile_clone=True, in_place_reset=is_subclassed)
-      self._assert_same_compile_params(new_model)
-      new_model.train_on_batch(inp, out)
-      new_model.evaluate(inp, out)
-
-  def test_clone_and_build_compiled_sequential_model(self):
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(4, input_shape=(4,)))
-      model.add(keras.layers.BatchNormalization())
-      model.add(keras.layers.Dropout(0.5))
-      model.add(keras.layers.Dense(4))
-      model.compile('rmsprop', 'mse',
-                    metrics=['acc', metrics.categorical_accuracy])
-
-    self._clone_and_build_test_helper(model)
-
-  def test_clone_and_build_functional_model(self):
-    with self.cached_session():
-      input_a = keras.Input(shape=(4,))
-      dense_1 = keras.layers.Dense(4,)
-      dense_2 = keras.layers.Dense(4,)
-
-      x_a = dense_1(input_a)
-      x_a = keras.layers.Dropout(0.5)(x_a)
-      x_a = keras.layers.BatchNormalization()(x_a)
-      x_a = dense_2(x_a)
-      model = keras.models.Model(input_a, x_a)
-      model.compile('rmsprop', 'mse',
-                    metrics=['acc', metrics.categorical_accuracy])
-
-    self._clone_and_build_test_helper(model)
-
-  def test_clone_and_build_subclassed_model(self):
-    class SubclassedModel(keras.Model):
-
-      def __init__(self):
-        super(SubclassedModel, self).__init__()
-        self.layer1 = keras.layers.Dense(4)
-        self.layer2 = keras.layers.Dense(4)
-
-      def call(self, inp):
-        out = self.layer1(inp)
-        out = keras.layers.BatchNormalization()(out)
-        out = keras.layers.Dropout(0.5)(out)
-        out = self.layer2(out)
-        return out
-
-    with self.cached_session():
-      model = SubclassedModel()
-      model.compile('rmsprop', 'mse',
-                    metrics=['acc', metrics.categorical_accuracy])
-    self._clone_and_build_test_helper(model, True)
+    inp = np.random.random((10, 4))
+    out = np.random.random((10, 4))
+    model.train_on_batch(inp, out)
+    self._clone_and_build_test_helper(model, 'sequential')
 
   def assert_optimizer_iterations_increases(self, optimizer):
-    with self.cached_session():
-      input_a = keras.Input(shape=(4,))
-      dense_1 = keras.layers.Dense(4,)
-      dense_2 = keras.layers.Dense(4,)
-
-      x_a = dense_1(input_a)
-      x_a = keras.layers.Dropout(0.5)(x_a)
-      x_a = keras.layers.BatchNormalization()(x_a)
-      x_a = dense_2(x_a)
-      model = keras.models.Model(input_a, x_a)
-      model.compile(optimizer, 'mse',
-                    metrics=['acc', metrics.categorical_accuracy])
+    model = _get_model()
+    model.compile(
+        optimizer, 'mse', metrics=['acc', metrics.categorical_accuracy],
+        run_eagerly=testing_utils.should_run_eagerly())
 
-      global_step = keras.backend.variable(123, dtype=dtypes.int64)
-      clone_model = models.clone_and_build_model(
-          model, compile_clone=True, optimizer_iterations=global_step)
+    global_step = keras.backend.variable(123, dtype=dtypes.int64)
+    clone_model = models.clone_and_build_model(
+        model, compile_clone=True, optimizer_iterations=global_step,
+        in_place_reset=(testing_utils.get_model_type() == 'subclass'))
 
-      inp = np.random.random((10, 4))
-      out = np.random.random((10, 4))
-      clone_model.train_on_batch(inp, out)
+    inp = np.random.random((10, 4))
+    out = np.random.random((10, 4))
+    clone_model.train_on_batch(inp, out)
 
-      self.assertEqual(K.eval(global_step), 124)
+    self.assertEqual(K.eval(global_step), 124)
 
+  @keras_parameterized.run_with_all_model_types
   def test_replace_tf_optimizer_iterations_variable(self):
     self.assert_optimizer_iterations_increases(adam.AdamOptimizer(0.01))
 
+  @keras_parameterized.run_with_all_model_types
   def test_replace_keras_optimizer_iterations_variable(self):
-    self.assert_optimizer_iterations_increases('adam')
+    if testing_utils.should_run_eagerly():
+      # This needs to be updated to run with v2 optimizers.
+      self.skipTest('b/120991591')
 
-  def test_replace_keras_optimizer_v2_iterations_variable(self):
-    self.assert_optimizer_iterations_increases(
-        keras.optimizer_v2.adam.Adam(0.01))
-
-  def test_clone_and_build_sequential_model_without_inputs_defined(self):
-    with self.cached_session():
-      model = sequential_model(False, False)
-      model.compile('rmsprop', 'mse',
-                    metrics=['acc', metrics.categorical_accuracy])
-    self._clone_and_build_test_helper(model, False)
-
-    with self.cached_session():
-      inp = np.random.random((10, 4))
-      out = np.random.random((10, 4))
-      model.train_on_batch(inp, out)
-    self._clone_and_build_test_helper(model, False)
+    self.assert_optimizer_iterations_increases('adam')
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/optimizer_v2/BUILD b/tensorflow/python/keras/optimizer_v2/BUILD
index b8f01249419c595a735442310c735bc10648cba6..da757edc74b458588c4351a7d5fab9d7255e2f4f 100644
--- a/tensorflow/python/keras/optimizer_v2/BUILD
+++ b/tensorflow/python/keras/optimizer_v2/BUILD
@@ -34,6 +34,7 @@ py_library(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:values",
     ],
 )
 
@@ -172,9 +173,9 @@ cuda_py_test(
 
 py_test(
     name = "optimizer_v2_test",
-    size = "large",
+    size = "medium",
     srcs = ["optimizer_v2_test.py"],
-    shard_count = 4,
+    shard_count = 8,
     tags = [
         "no_windows",
     ],
@@ -212,4 +213,5 @@ cuda_py_test(
         "//tensorflow/python:state_ops",
         "//tensorflow/python:variables",
     ],
+    shard_count = 2,
 )
diff --git a/tensorflow/python/keras/optimizer_v2/adadelta.py b/tensorflow/python/keras/optimizer_v2/adadelta.py
index 8f485b2440e497b708c4f8a40f2b1fe60a612257..cb9b7d1015ce921eaf3db51ba989ee7a897eb72b 100644
--- a/tensorflow/python/keras/optimizer_v2/adadelta.py
+++ b/tensorflow/python/keras/optimizer_v2/adadelta.py
@@ -77,7 +77,11 @@ class Adadelta(optimizer_v2.OptimizerV2):
                to better conditioning the grad update.
       name: Optional name prefix for the operations created when applying
         gradients.  Defaults to "Adadelta".
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
 
     @compatibility(eager)
     When eager execution is enabled, `learning_rate`, `rho`, and `epsilon` can
diff --git a/tensorflow/python/keras/optimizer_v2/adagrad.py b/tensorflow/python/keras/optimizer_v2/adagrad.py
index af359b5f591186641e483aa0dc30a734b3aee62f..7e4a153678f919be71a4af1d8ee08ef09a8b32ce 100644
--- a/tensorflow/python/keras/optimizer_v2/adagrad.py
+++ b/tensorflow/python/keras/optimizer_v2/adagrad.py
@@ -70,7 +70,11 @@ class Adagrad(optimizer_v2.OptimizerV2):
         Starting value for the accumulators, must be positive.
       name: Optional name prefix for the operations created when applying
         gradients.  Defaults to "Adagrad".
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
 
     Raises:
       ValueError: If the `initial_accumulator_value` or `epsilon` is invalid.
diff --git a/tensorflow/python/keras/optimizer_v2/adam.py b/tensorflow/python/keras/optimizer_v2/adam.py
index 292323be60a769e8330085b89627c66ec027bd87..a05bc7e96cd41032860f0615b502b62e0d34c5a4 100644
--- a/tensorflow/python/keras/optimizer_v2/adam.py
+++ b/tensorflow/python/keras/optimizer_v2/adam.py
@@ -21,7 +21,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import keras_export
@@ -125,7 +124,11 @@ class Adam(optimizer_v2.OptimizerV2):
         a callable that takes no arguments and returns the actual value to use.
         This can be useful for changing these values across different
         invocations of optimizer functions. @end_compatibility
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
     """
 
     super(Adam, self).__init__(name, **kwargs)
@@ -240,11 +243,6 @@ class Adam(optimizer_v2.OptimizerV2):
           use_locking=self._use_locking)
       return control_flow_ops.group(*[var_update, m_t, v_t, v_hat_t])
 
-  def _resource_scatter_add(self, x, i, v):
-    with ops.control_dependencies(
-        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
-      return x.value()
-
   def get_config(self):
     config = super(Adam, self).get_config()
     config.update({
diff --git a/tensorflow/python/keras/optimizer_v2/adamax.py b/tensorflow/python/keras/optimizer_v2/adamax.py
index 8ee5c2a9f890141a87651d712b727a4cfa4e5696..4a4660d32830540993db9a160f837aada55eb4cd 100644
--- a/tensorflow/python/keras/optimizer_v2/adamax.py
+++ b/tensorflow/python/keras/optimizer_v2/adamax.py
@@ -19,17 +19,16 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
-from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import keras_export
 
 
 @keras_export('keras.optimizers.Adamax', v1=[])
-class Adamax(adam.Adam):
+class Adamax(optimizer_v2.OptimizerV2):
   """Optimizer that implements the Adamax algorithm.
 
   It is a variant of Adam based on the infinity norm.
@@ -90,18 +89,25 @@ class Adamax(adam.Adam):
       epsilon: A small constant for numerical stability.
       name: Optional name for the operations created when applying gradients.
         Defaults to "Adamax".
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
     """
-    # pylint: disable=useless-super-delegation
-    super(Adamax, self).__init__(
-        learning_rate=learning_rate,
-        beta_1=beta_1,
-        beta_2=beta_2,
-        epsilon=epsilon,
-        amsgrad=False,
-        name=name,
-        **kwargs)
-    # pylint: enable=useless-super-delegation
+    super(Adamax, self).__init__(name, **kwargs)
+    self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
+    self._set_hyper('decay', self._initial_decay)
+    self._set_hyper('beta_1', beta_1)
+    self._set_hyper('beta_2', beta_2)
+    self._set_hyper('epsilon', epsilon)
+
+  def _create_slots(self, var_list):
+    # Separate for-loops to respect the ordering of slot variables from v1.
+    for var in var_list:
+      self.add_slot(var, 'm')  # Create slots for the first moments.
+    for var in var_list:
+      self.add_slot(var, 'v')  # Create slots for the second moments.
 
   def _resource_apply_dense(self, grad, var):
     var_dtype = var.dtype.base_dtype
@@ -154,8 +160,13 @@ class Adamax(adam.Adam):
       var_update = self._resource_scatter_add(var, indices, var_slice)
     return control_flow_ops.group(*[var_update, m_t, v_t])
 
-  def _resource_scatter_update(self, x, i, v):
-    with ops.control_dependencies(
-        [resource_variable_ops.resource_scatter_update(
-            x.handle, i, v)]):
-      return x.value()
+  def get_config(self):
+    config = super(Adamax, self).get_config()
+    config.update({
+        'learning_rate': self._serialize_hyperparameter('learning_rate'),
+        'decay': self._serialize_hyperparameter('decay'),
+        'beta_1': self._serialize_hyperparameter('beta_1'),
+        'beta_2': self._serialize_hyperparameter('beta_2'),
+        'epsilon': self._serialize_hyperparameter('epsilon'),
+    })
+    return config
diff --git a/tensorflow/python/keras/optimizer_v2/ftrl.py b/tensorflow/python/keras/optimizer_v2/ftrl.py
index 5783fb12b36081fee62d5a693eccc4cab676e6d8..15515616b24bd4bc6246ae3699ae210c7e9edf65 100644
--- a/tensorflow/python/keras/optimizer_v2/ftrl.py
+++ b/tensorflow/python/keras/optimizer_v2/ftrl.py
@@ -72,7 +72,11 @@ class Ftrl(optimizer_v2.OptimizerV2):
                   2*L2_shrinkage*lr_t / (1 + 2*L2*lr_t) * w_t
         where lr_t is the learning rate at t.
         When input is sparse shrinkage will only happen on the active weights.\
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
 
     Raises:
       ValueError: If one of the arguments is invalid.
diff --git a/tensorflow/python/keras/optimizer_v2/gradient_descent.py b/tensorflow/python/keras/optimizer_v2/gradient_descent.py
index 2e64e080954fc64b86a8ce8be750369e228f43fa..adb4ae2bf8674ba96338771a91feac9c4436f22c 100644
--- a/tensorflow/python/keras/optimizer_v2/gradient_descent.py
+++ b/tensorflow/python/keras/optimizer_v2/gradient_descent.py
@@ -74,7 +74,11 @@ class SGD(optimizer_v2.OptimizerV2):
       nesterov: boolean. Whether to apply Nesterov momentum.
       name: Optional name prefix for the operations created when applying
         gradients.  Defaults to 'SGD'.
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
     """
     super(SGD, self).__init__(name, **kwargs)
     self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
diff --git a/tensorflow/python/keras/optimizer_v2/nadam.py b/tensorflow/python/keras/optimizer_v2/nadam.py
index afa74c8de37665ea217fa55cbdea3dda86908f55..7141c6a0792ac820aace9f6d54c1ae7bbb34e18e 100644
--- a/tensorflow/python/keras/optimizer_v2/nadam.py
+++ b/tensorflow/python/keras/optimizer_v2/nadam.py
@@ -18,15 +18,16 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
-from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import keras_export
 
 
-class Nadam(adam.Adam):
+@keras_export('keras.optimizers.Nadam', v1=[])
+class Nadam(optimizer_v2.OptimizerV2):
   r"""Optimizer that implements the NAdam algorithm.
 
   Much like Adam is essentially RMSprop with momentum, Nadam is Adam with
@@ -34,17 +35,21 @@ class Nadam(adam.Adam):
 
   Initialization:
 
-  $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
-  $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+  $$m_0 := 0 \text{(Initialize 1st moment vector)}$$
+  $$v_0 := 0 \text{(Initialize 2nd moment vector)}$$
+  $$mu_0 := 1$$
   $$t := 0 \text{(Initialize timestep)}$$
 
   Computes:
   $$t := t + 1$$
-  $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
-  $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
-  $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-  $$m_bar_t := beta_1 * v_t + (1 - beta_1) * g$$
-  $$theta_t := theta_{t-1} - lr_t * m_bar_t / (\sqrt{v_t} + \epsilon)$$
+  $$\mu_t := \beta_1 * (1 - 0.5 * 0.96^{0.004 * t})$$
+  $$g' := g / (1 - \prod_{i=1}^{t}{\mu_i})$$
+  $$m_t := \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
+  $$m' := m_t / (1 - \prod_{i=1}^{t+1}{\mu_i})$$
+  $$v_t := \beta_2 * v_{t-1} + (1 - \beta_2) * g * g$$
+  $$v' := v_t / (1 - \beta_2^t)$$
+  $$\bar{m} := (1 - \mu_t) * g' + \mu_{t+1} * m'$$
+  $$\theta_t := \theta_{t-1} - lr * \bar{m} / (\sqrt{v'} + \epsilon)$$
 
   gradient is evaluated at theta(t) + momentum * v(t), and the variables always
   store theta + beta_1 * m / sqrt(v) instead of theta.
@@ -71,57 +76,87 @@ class Nadam(adam.Adam):
       epsilon: A small constant for numerical stability.
       name: Optional name for the operations created when applying gradients.
         Defaults to "Adamax".
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
     """
 
     # Backwards compatiblity with keras NAdam optimizer.
-    if 'schedule_decay' in kwargs:
-      kwargs['decay'] = kwargs.pop('schedule_decay')
-    # pylint: disable=useless-super-delegation
-    super(Nadam, self).__init__(
-        learning_rate=learning_rate,
-        beta_1=beta_1,
-        beta_2=beta_2,
-        epsilon=epsilon,
-        amsgrad=False,
-        name=name,
-        **kwargs)
-    # pylint: enable=useless-super-delegation
+    kwargs['decay'] = kwargs.pop('schedule_decay', 0.004)
+    super(Nadam, self).__init__(name, **kwargs)
+    self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
+    self._set_hyper('decay', self._initial_decay)
+    self._set_hyper('beta_1', beta_1)
+    self._set_hyper('beta_2', beta_2)
+    self._set_hyper('epsilon', epsilon)
+    self._m_cache = None
+
+  def _create_slots(self, var_list):
+    var_dtype = var_list[0].dtype.base_dtype
+    if self._m_cache is None:
+      self._m_cache = self.add_weight(
+          'momentum_cache',
+          shape=[],
+          dtype=var_dtype,
+          initializer='ones',
+          trainable=False)
+      self._weights.append(self._m_cache)
+    # Separate for-loops to respect the ordering of slot variables from v1.
+    for var in var_list:
+      # Create slots for the first moments.
+      self.add_slot(var, 'm')
+    for var in var_list:
+      # Create slots for the second moments.
+      self.add_slot(var, 'v')
+
+  def _prepare(self, var_list):
+    var_dtype = var_list[0].dtype.base_dtype
+    beta_1_t = self._get_hyper('beta_1', var_dtype)
+    local_step = math_ops.cast(self.iterations + 1, var_dtype)
+    decay_base = math_ops.cast(0.96, var_dtype)
+    self.m_cache_t = beta_1_t * (
+        1. - 0.5 * (math_ops.pow(decay_base, self._initial_decay * local_step)))
+    self.m_cache_t_1 = beta_1_t * (
+        1. - 0.5 *
+        (math_ops.pow(decay_base, self._initial_decay * (local_step + 1))))
+    m_schedule_new = self._m_cache * self.m_cache_t
+    self.m_schedule_new = state_ops.assign(
+        self._m_cache, m_schedule_new, use_locking=self._use_locking)
+    self.m_schedule_next = self.m_schedule_new * self.m_cache_t_1
 
   def _resource_apply_dense(self, grad, var):
     var_dtype = var.dtype.base_dtype
-    lr_t = self._decayed_lr(var_dtype)
+    lr_t = self._get_hyper('learning_rate', var_dtype)
+    epsilon_t = self._get_hyper('epsilon', var_dtype)
     m = self.get_slot(var, 'm')
     v = self.get_slot(var, 'v')
     beta_1_t = self._get_hyper('beta_1', var_dtype)
     beta_2_t = self._get_hyper('beta_2', var_dtype)
     local_step = math_ops.cast(self.iterations + 1, var_dtype)
-    beta_1_power = math_ops.pow(beta_1_t, local_step)
-    beta_2_power = math_ops.pow(beta_2_t, local_step)
-    return training_ops.resource_apply_adam(
-        var.handle,
-        m.handle,
-        v.handle,
-        beta_1_power,
-        beta_2_power,
-        lr_t,
-        beta_1_t,
-        beta_2_t,
-        self._get_hyper('epsilon', var_dtype),
-        grad,
-        use_locking=self._use_locking,
-        use_nesterov=True)
+
+    g_prime = grad / (1. - self.m_schedule_new)
+    m_t = beta_1_t * m + (1 - beta_1_t) * grad
+    m_t = state_ops.assign(m, m_t, use_locking=self._use_locking)
+    m_t_prime = m_t / (1. - self.m_schedule_next)
+    v_t = beta_2_t * v + (1 - beta_2_t) * math_ops.square(grad)
+    v_t = state_ops.assign(v, v_t, use_locking=self._use_locking)
+    v_t_prime = v_t / (1. - math_ops.pow(beta_2_t, local_step))
+    m_t_bar = (1. - self.m_cache_t) * g_prime + self.m_cache_t_1 * m_t_prime
+    var_t = var - lr_t * m_t_bar / (math_ops.sqrt(v_t_prime) + epsilon_t)
+    return state_ops.assign(var, var_t, use_locking=self._use_locking).op
 
   def _resource_apply_sparse(self, grad, var, indices):
     var_dtype = var.dtype.base_dtype
-    lr_t = self._decayed_lr(var_dtype)
+    lr_t = self._get_hyper('learning_rate', var_dtype)
+    epsilon_t = self._get_hyper('epsilon', var_dtype)
+    v = self.get_slot(var, 'v')
     beta_1_t = self._get_hyper('beta_1', var_dtype)
     beta_2_t = self._get_hyper('beta_2', var_dtype)
     local_step = math_ops.cast(self.iterations + 1, var_dtype)
-    beta_1_power = math_ops.pow(beta_1_t, local_step)
-    beta_2_power = math_ops.pow(beta_2_t, local_step)
-    epsilon_t = self._get_hyper('epsilon', var_dtype)
-    lr = (lr_t * math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power))
+
+    g_prime = grad / (1. - self.m_schedule_new)
 
     # m_t = beta1 * m + (1 - beta1) * g_t
     m = self.get_slot(var, 'm')
@@ -129,8 +164,10 @@ class Nadam(adam.Adam):
     m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking)
     with ops.control_dependencies([m_t]):
       m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)
-      # m_bar = (1 - beta1) * g_t + beta1 * m_t
-      m_bar = m_scaled_g_values + beta_1_t * array_ops.gather(m_t, indices)
+      m_t_slice = array_ops.gather(m_t, indices)
+
+    m_t_prime = m_t_slice / (1. - self.m_schedule_next)
+    m_t_bar = (1. - self.m_cache_t) * g_prime + self.m_cache_t_1 * m_t_prime
 
     # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
     v = self.get_slot(var, 'v')
@@ -138,9 +175,22 @@ class Nadam(adam.Adam):
     v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)
     with ops.control_dependencies([v_t]):
       v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
-
-    v_t_slice = array_ops.gather(v_t, indices)
-    v_sqrt = math_ops.sqrt(v_t_slice)
-    var_update = self._resource_scatter_add(var, indices,
-                                            -lr * m_bar / (v_sqrt + epsilon_t))
-    return control_flow_ops.group(*[var_update, m_bar, v_t])
+      v_t_slice = array_ops.gather(v_t, indices)
+
+    v_t_prime = v_t_slice / (1. - math_ops.pow(beta_2_t, local_step))
+    v_prime_sqrt = math_ops.sqrt(v_t_prime)
+
+    var_update = self._resource_scatter_add(
+        var, indices, -lr_t * m_t_bar / (v_prime_sqrt + epsilon_t))
+    return control_flow_ops.group(*[var_update, m_t_bar, v_t])
+
+  def get_config(self):
+    config = super(Nadam, self).get_config()
+    config.update({
+        'learning_rate': self._serialize_hyperparameter('learning_rate'),
+        'decay': self._serialize_hyperparameter('decay'),
+        'beta_1': self._serialize_hyperparameter('beta_1'),
+        'beta_2': self._serialize_hyperparameter('beta_2'),
+        'epsilon': self._serialize_hyperparameter('epsilon'),
+    })
+    return config
diff --git a/tensorflow/python/keras/optimizer_v2/nadam_test.py b/tensorflow/python/keras/optimizer_v2/nadam_test.py
index 73568e81f0c6ae680226a123c0098e56a131e826..44fad751bf441603c7571da550657455d4bdb1ef 100644
--- a/tensorflow/python/keras/optimizer_v2/nadam_test.py
+++ b/tensorflow/python/keras/optimizer_v2/nadam_test.py
@@ -40,45 +40,54 @@ def get_beta_accumulators(opt, dtype):
   return (beta_1_power, beta_2_power)
 
 
+def update_m_cache(m_cache, t, beta1=0.9):
+  mu_t = beta1 * (1 - 0.5 * 0.96**(0.004 * (t + 1)))
+  m_cache_t = m_cache * mu_t
+  return m_cache_t
+
+
 def nadam_update_numpy(param,
                        g_t,
                        t,
                        m,
                        v,
+                       m_cache,
                        alpha=0.001,
                        beta1=0.9,
                        beta2=0.999,
                        epsilon=1e-8):
-  alpha_t = alpha * np.sqrt(1 - beta2**(t + 1)) / (1 - beta1**(t + 1))
 
+  mu_t = beta1 * (1 - 0.5 * 0.96**(0.004 * (t + 1)))
+  mu_t_1 = beta1 * (1 - 0.5 * 0.96**(0.004 * (t + 2)))
+  m_cache_t_1 = m_cache * mu_t_1
+  g_prime_t = g_t / (1 - m_cache)
   m_t = beta1 * m + (1 - beta1) * g_t
   v_t = beta2 * v + (1 - beta2) * g_t * g_t
 
-  m_bar = (1 - beta1) * g_t + beta1 * m_t
+  m_prime_t = m_t / (1 - m_cache_t_1)
+  v_prime_t = v_t / (1 - beta2**(t + 1))
+  m_bar_t = (1 - mu_t) * g_prime_t + mu_t_1 * m_prime_t
 
-  param_t = param - alpha_t * m_bar / (np.sqrt(v_t) + epsilon)
+  param_t = param - alpha * m_bar_t / (np.sqrt(v_prime_t) + epsilon)
   return param_t, m_t, v_t
 
 
 class NadamOptimizerTest(test.TestCase):
 
-  def doTestSparse(self, use_resource=False):
+  @test_util.run_deprecated_v1
+  def testSparse(self):
     sparse_epsilon = 1e-7
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
         # Initialize variables for numpy implementation.
-        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        m0, v0, m1, v1, mcache = 0.0, 0.0, 0.0, 0.0, 1.0
         var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
         grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype)
         var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
         grads1_np = np.array([0.01, 0, 0.01], dtype=dtype.as_numpy_dtype)
 
-        if use_resource:
-          var0 = resource_variable_ops.ResourceVariable(var0_np)
-          var1 = resource_variable_ops.ResourceVariable(var1_np)
-        else:
-          var0 = variables.Variable(var0_np)
-          var1 = variables.Variable(var1_np)
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
         grads0_np_indices = np.array([0, 2], dtype=np.int32)
         grads0 = ops.IndexedSlices(
             constant_op.constant(grads0_np[grads0_np_indices]),
@@ -103,74 +112,22 @@ class NadamOptimizerTest(test.TestCase):
           self.assertAllCloseAccordingToType(0.999**(t + 1), beta2_power.eval())
           update.run()
 
+          mcache = update_m_cache(mcache, t)
           var0_np, m0, v0 = nadam_update_numpy(
-              var0_np, grads0_np, t, m0, v0, epsilon=sparse_epsilon)
+              var0_np, grads0_np, t, m0, v0, mcache, epsilon=sparse_epsilon)
           var1_np, m1, v1 = nadam_update_numpy(
-              var1_np, grads1_np, t, m1, v1, epsilon=sparse_epsilon)
+              var1_np, grads1_np, t, m1, v1, mcache, epsilon=sparse_epsilon)
 
           # Validate updated params
           self.assertAllCloseAccordingToType(var0_np, var0.eval())
           self.assertAllCloseAccordingToType(var1_np, var1.eval())
 
   @test_util.run_deprecated_v1
-  def testSparse(self):
-    self.doTestSparse(use_resource=False)
-
-  @test_util.run_deprecated_v1
-  def testResourceSparse(self):
-    self.doTestSparse(use_resource=True)
-
-  def doTestBasic(self, use_resource=False):
+  def testBasic(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
         # Initialize variables for numpy implementation.
-        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
-        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
-        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
-        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
-        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
-
-        if use_resource:
-          var0 = resource_variable_ops.ResourceVariable(var0_np)
-          var1 = resource_variable_ops.ResourceVariable(var1_np)
-        else:
-          var0 = variables.Variable(var0_np)
-          var1 = variables.Variable(var1_np)
-        grads0 = constant_op.constant(grads0_np)
-        grads1 = constant_op.constant(grads1_np)
-        opt = nadam.Nadam()
-        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-
-        # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
-
-        beta1_power, beta2_power = get_beta_accumulators(opt, dtype)
-
-        # Run 3 steps of Nadam
-        for t in range(3):
-          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**(t + 1), beta2_power.eval())
-          update.run()
-
-          var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0)
-          var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1)
-
-          # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
-
-  @test_util.run_deprecated_v1
-  def testResourceBasic(self):
-    self.doTestBasic(use_resource=True)
-
-  @test_util.run_deprecated_v1
-  def testBasicWithLearningRateDecay(self):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        # Initialize variables for numpy implementation.
-        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        m0, v0, m1, v1, mcache = 0.0, 0.0, 0.0, 0.0, 1.0
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
         grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
         var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
@@ -180,9 +137,7 @@ class NadamOptimizerTest(test.TestCase):
         var1 = resource_variable_ops.ResourceVariable(var1_np)
         grads0 = constant_op.constant(grads0_np)
         grads1 = constant_op.constant(grads1_np)
-        learning_rate = 0.001
-        decay = 0.5
-        opt = nadam.Nadam(learning_rate=learning_rate, decay=decay)
+        opt = nadam.Nadam()
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
 
@@ -190,19 +145,15 @@ class NadamOptimizerTest(test.TestCase):
         self.assertAllClose([1.0, 2.0], var0.eval())
         self.assertAllClose([3.0, 4.0], var1.eval())
 
-        beta1_power, beta2_power = get_beta_accumulators(opt, dtype)
-
         # Run 3 steps of Nadam
         for t in range(3):
-          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**(t + 1), beta2_power.eval())
           update.run()
 
-          lr = learning_rate / (1 + decay * t)
-          var0_np, m0, v0 = nadam_update_numpy(
-              var0_np, grads0_np, t, m0, v0, alpha=lr)
-          var1_np, m1, v1 = nadam_update_numpy(
-              var1_np, grads1_np, t, m1, v1, alpha=lr)
+          mcache = update_m_cache(mcache, t)
+          var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0,
+                                               mcache)
+          var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1,
+                                               mcache)
 
           # Validate updated params
           self.assertAllCloseAccordingToType(var0_np, var0.eval())
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
index d0f16f0b4f340b5dbb088171427b3823894d6e34..894af66f5dbb7d95c190d590d1ba5130df39b2b2 100644
--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
@@ -28,6 +28,7 @@ import six
 from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
 from tensorflow.python.distribute import reduce_util as ds_reduce_util
+from tensorflow.python.distribute import values as distributed_values
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
@@ -39,6 +40,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -155,13 +157,26 @@ class OptimizerV2(checkpointable.CheckpointableBase):
     Args:
       name: A non-empty string.  The name to use for accumulators created
         for the optimizer.
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
 
     Raises:
       ValueError: If name is malformed.
       RuntimeError: If _create_slots has been overridden instead of
           _create_vars.
     """
+    allowed_kwargs = {"clipnorm", "clipvalue", "lr", "decay"}
+    for k in kwargs:
+      if k not in allowed_kwargs:
+        raise TypeError("Unexpected keyword argument "
+                        "passed to optimizer: " + str(k))
+      # checks that all keyword arguments are non-negative.
+      if kwargs[k] < 0:
+        raise ValueError("Expected {} >= 0, received: {}".format(k, kwargs[k]))
+
     self._use_locking = True
     self._name = name
     self._hyper = {}
@@ -183,9 +198,12 @@ class OptimizerV2(checkpointable.CheckpointableBase):
     if decay < 0.:
       raise ValueError("decay cannot be less than 0: {}".format(decay))
     self._initial_decay = decay
-    self.__dict__.update(kwargs)
+    if "clipnorm" in kwargs:
+      self.clipnorm = kwargs.pop("clipnorm")
+    if "clipvalue" in kwargs:
+      self.clipvalue = kwargs.pop("clipvalue")
 
-    self._prepared = False
+    self._hypers_created = False
 
   def minimize(self, loss, var_list, grad_loss=None, name=None):
     """Add operations to minimize `loss` by updating `var_list`.
@@ -272,8 +290,7 @@ class OptimizerV2(checkpointable.CheckpointableBase):
   @staticmethod
   def _scale_loss(loss_value):
     if distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN:
-      num_replicas = \
-        distribute_ctx.get_distribution_strategy().num_replicas_in_sync
+      num_replicas = distribute_ctx.get_strategy().num_replicas_in_sync
       if num_replicas > 1:
         loss_value *= (1. / num_replicas)
     return loss_value
@@ -331,15 +348,17 @@ class OptimizerV2(checkpointable.CheckpointableBase):
     """
     grads_and_vars = _filter_grads(grads_and_vars)
     var_list = [v for (_, v) in grads_and_vars]
-    if distribute_ctx.has_distribution_strategy():
+    if distribute_ctx.has_strategy():
       reduced_grads = merge_grads(grads_and_vars)
       grads_and_vars = zip(reduced_grads, var_list)
 
-    self._prepare()
+    self._create_hypers()
     with ops.init_scope():
       self._create_slots(var_list)
     update_ops = []
 
+    self._prepare(var_list)
+
     def update_grad_to_var(grad, var):
       """Apply gradient to variable."""
       if isinstance(var, ops.Tensor):
@@ -461,8 +480,11 @@ class OptimizerV2(checkpointable.CheckpointableBase):
     slot_dict = self._slots[var_key]
     return slot_dict[slot_name]
 
-  def _prepare(self):
-    if self._prepared:
+  def _prepare(self, var_list):
+    pass
+
+  def _create_hypers(self):
+    if self._hypers_created:
       return
     if self._iterations is None:
       with ops.device("cpu:0"):
@@ -483,18 +505,18 @@ class OptimizerV2(checkpointable.CheckpointableBase):
             trainable=False,
             initializer=value,
             aggregation=tf_variables.VariableAggregation.ONLY_FIRST_REPLICA)
-    self._prepared = True
+    self._hypers_created = True
 
   @property
   def iterations(self):
     """Variable. The number of training steps this Optimizer has run."""
-    if not self._prepared:
-      self._prepare()
+    if not self._hypers_created:
+      self._create_hypers()
     return self._iterations
 
   @iterations.setter
   def iterations(self, variable):
-    if self._prepared:
+    if self._hypers_created:
       raise RuntimeError("Cannot set `iterations` to a new Variable after"
                          "the Optimizer weights have been created")
     self._iterations = variable
@@ -554,7 +576,8 @@ class OptimizerV2(checkpointable.CheckpointableBase):
     value = self._get_hyper(hyperparameter_name)
     if callable(value):
       return value()
-    if isinstance(value, (ops.Tensor, tf_variables.Variable)):
+    if isinstance(value, (ops.Tensor, tf_variables.Variable,
+                          distributed_values.TPUMirroredVariable)):
       return backend.get_value(value)
     return value
 
@@ -723,6 +746,16 @@ class OptimizerV2(checkpointable.CheckpointableBase):
     """
     raise NotImplementedError()
 
+  def _resource_scatter_add(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
+      return x.value()
+
+  def _resource_scatter_update(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_update(x.handle, i, v)]):
+      return x.value()
+
   # ---------------
   # For implementing the checkpointable interface
   # ---------------
@@ -866,8 +899,7 @@ def _var_key(var):
   """
 
   # pylint: disable=protected-access
-  if distribute_ctx.has_distribution_strategy() and hasattr(
-      var, "_primary_var"):
+  if distribute_ctx.has_strategy() and hasattr(var, "_primary_var"):
     var = var._primary_var
   if hasattr(var, "op"):
     return var._shared_name
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
index 42f9fcaea8ecab213029e56d56c5854b527ef95c..93a6a7e85e486aaeea19877c360b1b8413d5e141 100644
--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
@@ -18,38 +18,36 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-import tempfile
-
-from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
-from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import backend
 from tensorflow.python.keras import callbacks
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import optimizers
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import input_layer
-from tensorflow.python.keras.engine import saving
 from tensorflow.python.keras.engine import sequential
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
+from tensorflow.python.keras.optimizer_v2 import adadelta
+from tensorflow.python.keras.optimizer_v2 import adagrad
 from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.keras.optimizer_v2 import adamax
 from tensorflow.python.keras.optimizer_v2 import gradient_descent
-from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.keras.optimizer_v2 import nadam
+from tensorflow.python.keras.optimizer_v2 import rmsprop
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
-from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.training import momentum
 from tensorflow.python.training import training_util
@@ -262,23 +260,6 @@ class OptimizerTest(test.TestCase):
       self.evaluate(sgd.iterations.initializer)
       self.assertEqual(0, self.evaluate(sgd.iterations))
 
-  @test_util.run_in_graph_and_eager_modes
-  def testSerializationWithinDefun(self):
-    with self.cached_session():
-      sgd = gradient_descent.SGD(3.0)
-      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0],
-                                                    dtype=dtypes.float32)
-      loss = lambda: 3 * var0
-      sgd.minimize(loss, [var0])
-
-      def serialize():
-        config = sgd.get_config()
-        gradient_descent.SGD.from_config(config)
-
-      compiled_serialize = function.defun(serialize)
-      with self.assertRaisesRegexp(RuntimeError, 'inside Tensorflow graph'):
-        compiled_serialize()
-
   @test_util.run_in_graph_and_eager_modes
   def testConfig(self):
     with self.cached_session():
@@ -321,6 +302,16 @@ class OptimizerTest(test.TestCase):
       self.evaluate(opt_op)
       self.assertAllClose([0.], self.evaluate(var))
 
+  @test_util.run_in_graph_and_eager_modes
+  def testInvalidClipNorm(self):
+    with self.assertRaisesRegexp(ValueError, '>= 0'):
+      gradient_descent.SGD(learning_rate=1.0, clipnorm=-1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInvalidKwargs(self):
+    with self.assertRaisesRegexp(TypeError, 'Unexpected keyword argument'):
+      gradient_descent.SGD(learning_rate=1.0, invalidkwargs=1.0)
+
   @test_util.run_in_graph_and_eager_modes
   def testWeights(self):
     with self.cached_session():
@@ -396,6 +387,31 @@ class OptimizerTest(test.TestCase):
     with self.assertRaises(AttributeError):
       opt.not_an_attr += 3
 
+  @test_util.run_in_graph_and_eager_modes
+  def testGettingHyperParametersWithLrInConstructor(self):
+    opt = gradient_descent.SGD(lr=3.0)
+    var = resource_variable_ops.ResourceVariable([1.0, 2.0],
+                                                 dtype=dtypes.float32)
+    loss = lambda: 3 * var
+    opt_op = opt.minimize(loss, [var])
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(opt_op)
+
+    self.assertTrue(isinstance(opt.lr, resource_variable_ops.ResourceVariable))
+    self.assertTrue(
+        isinstance(opt.learning_rate, resource_variable_ops.ResourceVariable))
+
+    lr = self.evaluate(opt.lr)
+    self.assertEqual(3.0, lr)
+
+    opt.lr = 2.0
+    lr = self.evaluate(opt.lr)
+    self.assertEqual(2.0, lr)
+
+    self.evaluate(opt.lr.assign(4.0))
+    lr = self.evaluate(opt.lr)
+    self.assertEqual(4.0, lr)
+
   @test_util.run_in_graph_and_eager_modes
   def testOptimizerWithKerasModel(self):
     a = input_layer.Input(shape=(3,), name='input_a')
@@ -475,25 +491,20 @@ class OptimizerTest(test.TestCase):
     opt.iterations = global_step
     var = resource_variable_ops.ResourceVariable([1.0, 2.0],
                                                  dtype=dtypes.float32)
+    self.evaluate(variables.global_variables_initializer())
+    init_step_value = self.evaluate(global_step)
     loss = lambda: 3 * var
     opt_op = opt.minimize(loss, [var])
     self.evaluate(variables.global_variables_initializer())
-    init_step_value = self.evaluate(global_step)
     self.evaluate(opt_op)
     new_step_value = self.evaluate(global_step)
     self.assertEqual(new_step_value, init_step_value + 1)
 
 
-class OptimizersCompatibilityTest(test.TestCase, parameterized.TestCase):
+@keras_parameterized.run_with_all_model_types
+class OptimizersCompatibilityTest(keras_parameterized.TestCase):
 
-  # TODO(tanzheny): remove test_numeric after algorithm for Momentum, Adam and
-  # NAdam has been unified: currently these three algorithms behave differently.
-  @parameterized.named_parameters(
-      ('adadelta', 'adadelta', True, True), ('adagrad', 'adagrad', True, True),
-      ('adam', 'adam', True, True), ('adamax', 'adamax', True, True),
-      ('nadam', 'nadam', True, False), ('momentum', 'momentum', True, True),
-      ('sgd', 'sgd', False, True))
-  def testOptimizersCompatibility(self, opt_str, test_weights, test_numeric):
+  def _testOptimizersCompatibility(self, opt_v1, opt_v2, test_weights=True):
     np.random.seed(1331)
     with self.cached_session():
       train_samples = 20
@@ -507,43 +518,63 @@ class OptimizersCompatibilityTest(test.TestCase, parameterized.TestCase):
       y = keras.utils.to_categorical(y)
 
       num_hidden = 5
-      model = testing_utils.get_small_sequential_mlp(
+      model_v1 = testing_utils.get_small_sequential_mlp(
           num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_v1.compile(opt_v1, loss='categorical_crossentropy', metrics=[])
+      model_v1.fit(x, y, batch_size=5, epochs=1)
 
-      old_mode = os.environ.get('TF2_BEHAVIOR', None)
-      # Disable tf2 to create V1 optimizer.
-      disable_tf2()
-      if opt_str == 'momentum':
-        opt_v1 = optimizers.SGD(momentum=0.9)
-      else:
-        opt_v1 = optimizers.get(opt_str)
-
-      # Test compile and fit with v1 optimizer.
-      model.compile(opt_v1, loss='categorical_crossentropy', metrics=[])
-      model.fit(x, y, batch_size=5, epochs=1)
-      model_dir = tempfile.mkdtemp()
-      gfile.MakeDirs(model_dir)
-      file_name = os.path.join(model_dir, 'model.h5')
-      model.save(file_name)
-
-      enable_tf2()
-      # Test load and fit with v2 optimizer.
-      model_2 = saving.load_model(file_name)
-      opt_v2 = model_2.optimizer
-      self.assertIsInstance(opt_v2, optimizer_v2.OptimizerV2)
-      # set_weights is called inside load_model but exception is swallowed,
-      # this call checks the weights can be set correctly.
+      model_v2 = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_v2.set_weights(model_v1.get_weights())
+      model_v2.compile(opt_v2, loss='categorical_crossentropy', metrics=[])
+      model_v2._make_train_function()
       if test_weights:
         opt_v2.set_weights(opt_v1.get_weights())
-      if test_numeric:
-        hist_1 = model.fit(x, y, batch_size=5, epochs=1, shuffle=False)
-        hist_2 = model_2.fit(x, y, batch_size=5, epochs=1, shuffle=False)
-        self.assertAllClose(model.get_weights(), model_2.get_weights())
-        self.assertAllClose(model.get_weights(), model_2.get_weights())
-        self.assertAllClose(hist_1.history['loss'], hist_2.history['loss'])
 
-      if old_mode is not None:
-        os.environ['TF2_BEHAVIOR'] = old_mode
+      hist_1 = model_v1.fit(x, y, batch_size=5, epochs=1, shuffle=False)
+      hist_2 = model_v2.fit(x, y, batch_size=5, epochs=1, shuffle=False)
+      self.assertAllClose(model_v1.get_weights(), model_v2.get_weights())
+      self.assertAllClose(hist_1.history['loss'], hist_2.history['loss'])
+
+  def testAdadeltaCompatibility(self):
+    opt_v1 = optimizers.Adadelta(lr=0.01)
+    opt_v2 = adadelta.Adadelta(learning_rate=0.01)
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testAdagradCompatibility(self):
+    opt_v1 = optimizers.Adagrad(lr=0.01)
+    opt_v2 = adagrad.Adagrad(learning_rate=0.01)
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testAdamCompatibility(self):
+    opt_v1 = optimizers.Adam()
+    opt_v2 = adam.Adam()
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testAdamaxCompatibility(self):
+    opt_v1 = optimizers.Adamax(lr=0.01)
+    opt_v2 = adamax.Adamax(learning_rate=0.01)
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testNadamCompatibility(self):
+    opt_v1 = optimizers.Nadam(lr=0.001)
+    opt_v2 = nadam.Nadam(learning_rate=0.001)
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testMomentumCompatibility(self):
+    opt_v1 = optimizers.SGD(lr=0.01, momentum=0.9)
+    opt_v2 = gradient_descent.SGD(learning_rate=0.01, momentum=0.9)
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testRMSpropCompatibility(self):
+    opt_v1 = optimizers.RMSprop()
+    opt_v2 = rmsprop.RMSprop()
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testSGDCompatibility(self):
+    opt_v1 = optimizers.SGD(lr=0.01)
+    opt_v2 = gradient_descent.SGD(learning_rate=0.01)
+    self._testOptimizersCompatibility(opt_v1, opt_v2, False)
 
   def testNumericEquivalenceForNesterovMomentum(self):
     np.random.seed(1331)
@@ -621,15 +652,6 @@ class OptimizersCompatibilityTest(test.TestCase, parameterized.TestCase):
       self.assertAllClose(hist_k_v1.history['loss'], hist_k_v2.history['loss'])
 
 
-def disable_tf2():
-  if 'TF2_BEHAVIOR' in os.environ:
-    del os.environ['TF2_BEHAVIOR']
-
-
-def enable_tf2():
-  os.environ['TF2_BEHAVIOR'] = 'enabled'
-
-
 # Note: These tests are kept in a separate class to avoid bugs in some
 # distributions of Python that break AutoGraph which is used by tf.function.
 class OptimizerWithFunctionTest(test.TestCase):
diff --git a/tensorflow/python/keras/optimizer_v2/rmsprop.py b/tensorflow/python/keras/optimizer_v2/rmsprop.py
index b52ac4524676bd5f92e56317387b501984fc1ae1..83b81cbe5d458fbec14d1af45f84c32604dbe187 100644
--- a/tensorflow/python/keras/optimizer_v2/rmsprop.py
+++ b/tensorflow/python/keras/optimizer_v2/rmsprop.py
@@ -17,8 +17,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.framework import ops
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import keras_export
 
@@ -90,7 +96,11 @@ class RMSprop(optimizer_v2.OptimizerV2):
         `epsilon` can each be a callable that takes no arguments and returns the
         actual value to use. This can be useful for changing these values across
         different invocations of optimizer functions. @end_compatibility
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
     """
     super(RMSprop, self).__init__(name, **kwargs)
     self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
@@ -110,77 +120,122 @@ class RMSprop(optimizer_v2.OptimizerV2):
   def _create_slots(self, var_list):
     for var in var_list:
       self.add_slot(var, "rms")
-      self.add_slot(var, "momentum")
-      if self.centered:
+    if self._momentum:
+      for var in var_list:
+        self.add_slot(var, "momentum")
+    if self.centered:
+      for var in var_list:
         self.add_slot(var, "mg")
 
   def _resource_apply_dense(self, grad, var):
     var_dtype = var.dtype.base_dtype
     lr_t = self._decayed_lr(var_dtype)
     rms = self.get_slot(var, "rms")
-    mom = self.get_slot(var, "momentum")
     rho = self._get_hyper("rho", var_dtype)
     momentum = self._get_hyper("momentum", var_dtype)
     epsilon = self._get_hyper("epsilon", var_dtype)
-    if self.centered:
-      mg = self.get_slot(var, "mg")
-      return training_ops.resource_apply_centered_rms_prop(
-          var.handle,
-          mg.handle,
-          rms.handle,
-          mom.handle,
-          lr_t,
-          rho,
-          momentum,
-          epsilon,
-          grad,
-          use_locking=self._use_locking)
+    if self._momentum:
+      mom = self.get_slot(var, "momentum")
+      if self.centered:
+        mg = self.get_slot(var, "mg")
+        return training_ops.resource_apply_centered_rms_prop(
+            var.handle,
+            mg.handle,
+            rms.handle,
+            mom.handle,
+            lr_t,
+            rho,
+            momentum,
+            epsilon,
+            grad,
+            use_locking=self._use_locking)
+      else:
+        return training_ops.resource_apply_rms_prop(
+            var.handle,
+            rms.handle,
+            mom.handle,
+            lr_t,
+            rho,
+            momentum,
+            epsilon,
+            grad,
+            use_locking=self._use_locking)
     else:
-      return training_ops.resource_apply_rms_prop(
-          var.handle,
-          rms.handle,
-          mom.handle,
-          lr_t,
-          rho,
-          momentum,
-          epsilon,
-          grad,
-          use_locking=self._use_locking)
+      rms_t = rho * rms + (1. - rho) * math_ops.square(grad)
+      rms_t = state_ops.assign(rms, rms_t, use_locking=self._use_locking)
+      denom_t = rms_t
+      if self.centered:
+        mg = self.get_slot(var, "mg")
+        mg_t = rho * mg + (1. - rho) * grad
+        mg_t = state_ops.assign(mg, mg_t, use_locking=self._use_locking)
+        denom_t = rms_t - math_ops.square(mg_t)
+      var_t = var - lr_t * grad / (math_ops.sqrt(denom_t) + epsilon)
+      return state_ops.assign(var, var_t, use_locking=self._use_locking).op
 
   def _resource_apply_sparse(self, grad, var, indices):
     var_dtype = var.dtype.base_dtype
     lr_t = self._decayed_lr(var_dtype)
     rms = self.get_slot(var, "rms")
-    mom = self.get_slot(var, "momentum")
     rho = self._get_hyper("rho", var_dtype)
     momentum = self._get_hyper("momentum", var_dtype)
     epsilon = self._get_hyper("epsilon", var_dtype)
-    if self.centered:
-      mg = self.get_slot(var, "mg")
-      return training_ops.resource_sparse_apply_centered_rms_prop(
-          var.handle,
-          mg.handle,
-          rms.handle,
-          mom.handle,
-          lr_t,
-          rho,
-          momentum,
-          epsilon,
-          grad,
-          indices,
-          use_locking=self._use_locking)
+    if self._momentum:
+      mom = self.get_slot(var, "momentum")
+      if self.centered:
+        mg = self.get_slot(var, "mg")
+        return training_ops.resource_sparse_apply_centered_rms_prop(
+            var.handle,
+            mg.handle,
+            rms.handle,
+            mom.handle,
+            lr_t,
+            rho,
+            momentum,
+            epsilon,
+            grad,
+            indices,
+            use_locking=self._use_locking)
+      else:
+        return training_ops.resource_sparse_apply_rms_prop(
+            var.handle,
+            rms.handle,
+            mom.handle,
+            lr_t,
+            rho,
+            momentum,
+            epsilon,
+            grad,
+            indices,
+            use_locking=self._use_locking)
     else:
-      return training_ops.resource_sparse_apply_rms_prop(
-          var.handle,
-          rms.handle,
-          mom.handle,
-          lr_t,
-          rho,
-          momentum,
-          epsilon,
-          grad,
-          indices,
-          use_locking=self._use_locking)
+      rms_scaled_g_values = (grad * grad) * (1. - rho)
+      rms_t = state_ops.assign(rms, rms * rho, use_locking=self._use_locking)
+      with ops.control_dependencies([rms_t]):
+        rms_t = self._resource_scatter_add(rms, indices, rms_scaled_g_values)
+        rms_slice = array_ops.gather(rms_t, indices)
+      denom_slice = rms_slice
+      if self.centered:
+        mg = self.get_slot(var, "mg")
+        mg_scaled_g_values = grad * (1. - rho)
+        mg_t = state_ops.assign(mg, mg * rho, use_locking=self._use_locking)
+        with ops.control_dependencies([mg_t]):
+          mg_t = self._resource_scatter_add(mg, indices, mg_scaled_g_values)
+          mg_slice = array_ops.gather(mg_t, indices)
+          denom_slice = rms_slice - math_ops.square(mg_slice)
+      var_update = self._resource_scatter_add(
+          var, indices, -lr_t * grad / (math_ops.sqrt(denom_slice) + epsilon))
+      if self.centered:
+        return control_flow_ops.group(*[var_update, rms_t, mg_t])
+      return control_flow_ops.group(*[var_update, rms_t])
+
+  def set_weights(self, weights):
+    params = self.weights
+    # Override set_weights for backward compatibility of Keras V1 optimizer
+    # since it does not include iteration at head of the weight list. Set
+    # iteration to 0.
+    if len(params) == len(weights) + 1:
+      weights = [np.array(0)] + weights
+    super(RMSprop, self).set_weights(weights)
 
   def get_config(self):
     config = super(RMSprop, self).get_config()
diff --git a/tensorflow/python/keras/optimizer_v2/rmsprop_test.py b/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
index 4d61cfbbc52789db172445f9286fdb848c0a7bc6..67662aae201f7895632491882dcc87c0732cd42d 100644
--- a/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
+++ b/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
@@ -58,14 +58,18 @@ class RMSpropOptimizerTest(test.TestCase):
   def _rmsprop_update_numpy(self, var, g, mg, rms, mom, lr, rho, momentum,
                             epsilon, centered):
     rms_t = rms * rho + (1 - rho) * g * g
-    denom_t = rms_t + epsilon
     if centered:
       mg_t = mg * rho + (1 - rho) * g
-      denom_t -= mg_t * mg_t
+      denom_t = rms_t - mg_t * mg_t
     else:
       mg_t = mg
-    mom_t = momentum * mom + lr * g / np.sqrt(denom_t, dtype=denom_t.dtype)
-    var_t = var - mom_t
+      denom_t = rms_t
+    if momentum > 0.:
+      mom_t = momentum * mom + lr * g / (np.sqrt(denom_t + epsilon))
+      var_t = var - mom_t
+    else:
+      mom_t = mom
+      var_t = var - lr * g / (np.sqrt(denom_t) + epsilon)
     return var_t, mg_t, rms_t, mom_t
 
   def _sparse_rmsprop_update_numpy(self, var, gindexs, gvalues, mg, rms, mom,
@@ -78,12 +82,18 @@ class RMSpropOptimizerTest(test.TestCase):
       gindex = gindexs[i]
       gvalue = gvalues[i]
       rms_t[gindex] = rms[gindex] * rho + (1 - rho) * gvalue * gvalue
-      denom_t = rms_t[gindex] + epsilon
       if centered:
         mg_t[gindex] = mg_t[gindex] * rho + (1 - rho) * gvalue
-        denom_t -= mg_t[gindex] * mg_t[gindex]
-      mom_t[gindex] = momentum * mom[gindex] + lr * gvalue / np.sqrt(denom_t)
-      var_t[gindex] = var[gindex] - mom_t[gindex]
+        denom_t = rms_t[gindex] - mg_t[gindex] * mg_t[gindex]
+      else:
+        denom_t = rms_t[gindex]
+      if momentum > 0.:
+        mom_t[gindex] = momentum * mom[gindex] + lr * gvalue / np.sqrt(denom_t +
+                                                                       epsilon)
+        var_t[gindex] = var[gindex] - mom_t[gindex]
+      else:
+        mom_t[gindex] = mom[gindex]
+        var_t[gindex] = var[gindex] - lr * gvalue / (np.sqrt(denom_t) + epsilon)
     return var_t, mg_t, rms_t, mom_t
 
   @test_util.run_deprecated_v1
@@ -117,14 +127,17 @@ class RMSpropOptimizerTest(test.TestCase):
           mg0 = None
           mg1 = None
 
+        if momentum > 0.:
+          mom0 = opt.get_slot(var0, "momentum")
+          mom1 = opt.get_slot(var1, "momentum")
+        else:
+          mom0 = None
+          mom1 = None
+
         rms0 = opt.get_slot(var0, "rms")
         self.assertTrue(rms0 is not None)
         rms1 = opt.get_slot(var1, "rms")
         self.assertTrue(rms1 is not None)
-        mom0 = opt.get_slot(var0, "momentum")
-        self.assertTrue(mom0 is not None)
-        mom1 = opt.get_slot(var1, "momentum")
-        self.assertTrue(mom1 is not None)
 
         mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
         mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
@@ -137,8 +150,8 @@ class RMSpropOptimizerTest(test.TestCase):
         self.assertAllClose([1.0, 2.0], self.evaluate(var0))
         self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
-        # Run 4 steps of RMSprop
-        for _ in range(1, 5):
+        # Run 3 steps of RMSprop
+        for _ in range(1, 4):
           self.evaluate(update)
 
           var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
@@ -152,10 +165,11 @@ class RMSpropOptimizerTest(test.TestCase):
           if centered:
             self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0))
             self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
+          if momentum > 0.:
+            self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+            self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
           self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
           self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
-          self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
-          self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
           self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
           self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
@@ -191,10 +205,12 @@ class RMSpropOptimizerTest(test.TestCase):
     self.assertTrue(rms0 is not None)
     rms1 = opt.get_slot(var1, "rms")
     self.assertTrue(rms1 is not None)
-    mom0 = opt.get_slot(var0, "momentum")
-    self.assertTrue(mom0 is not None)
-    mom1 = opt.get_slot(var1, "momentum")
-    self.assertTrue(mom1 is not None)
+    if momentum > 0.:
+      mom0 = opt.get_slot(var0, "momentum")
+      mom1 = opt.get_slot(var1, "momentum")
+    else:
+      mom0 = None
+      mom1 = None
 
     mg0_np = np.array([0.0, 0.0])
     mg1_np = np.array([0.0, 0.0])
@@ -222,8 +238,9 @@ class RMSpropOptimizerTest(test.TestCase):
       # Validate updated params
       self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
       self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
-      self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
-      self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+      if momentum > 0.:
+        self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+        self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
       self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
       self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
@@ -325,10 +342,12 @@ class RMSpropOptimizerTest(test.TestCase):
         self.assertTrue(rms0 is not None)
         rms1 = opt.get_slot(var1, "rms")
         self.assertTrue(rms1 is not None)
-        mom0 = opt.get_slot(var0, "momentum")
-        self.assertTrue(mom0 is not None)
-        mom1 = opt.get_slot(var1, "momentum")
-        self.assertTrue(mom1 is not None)
+        if momentum > 0.:
+          mom0 = opt.get_slot(var0, "momentum")
+          mom1 = opt.get_slot(var1, "momentum")
+        else:
+          mom0 = None
+          mom1 = None
 
         mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
         mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
@@ -341,8 +360,8 @@ class RMSpropOptimizerTest(test.TestCase):
         self.assertAllClose([1.0, 2.0], self.evaluate(var0))
         self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
-        # Run 4 steps of RMSprop
-        for _ in range(1, 5):
+        # Run 3 steps of RMSprop
+        for _ in range(1, 4):
           self.evaluate(update)
 
           var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
@@ -358,8 +377,9 @@ class RMSpropOptimizerTest(test.TestCase):
             self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
           self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
           self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
-          self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
-          self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+          if momentum > 0.:
+            self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+            self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
           self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
           self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
@@ -420,6 +440,32 @@ class RMSpropOptimizerTest(test.TestCase):
     opt_3 = rmsprop.RMSprop(learning_rate=0.1)
     self.assertEqual(opt_3.lr, 0.1)
 
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = variables.Variable(1.)
+      v2 = variables.Variable(1.)
+
+      opt = rmsprop.RMSprop(1., momentum=0., centered=False)
+      opt.minimize(lambda: v1 + v2, var_list=[v1, v2])
+      # There should be iteration, and one unique slot variable for v1 and v2.
+      self.assertEqual(3, len(set(opt.variables())))
+      self.assertEqual(
+          self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations))
+
+      opt = rmsprop.RMSprop(learning_rate=1., momentum=0.2, centered=False)
+      opt.minimize(lambda: v1 + v2, var_list=[v1, v2])
+      # There should be iteration, and two unique slot variables for v1 and v2.
+      self.assertEqual(5, len(set(opt.variables())))
+      self.assertEqual(
+          self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations))
+
+      opt = rmsprop.RMSprop(learning_rate=1., momentum=0.2, centered=True)
+      opt.minimize(lambda: v1 + v2, var_list=[v1, v2])
+      # There should be iteration, and three unique slot variables for v1 and v2
+      self.assertEqual(7, len(set(opt.variables())))
+      self.assertEqual(
+          self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/keras/optimizers.py b/tensorflow/python/keras/optimizers.py
index bcece417973826b120e84233e00972ceaef5e1fe..82fb555b57e8ebb7f0657e1d4a317403714c4ab9 100644
--- a/tensorflow/python/keras/optimizers.py
+++ b/tensorflow/python/keras/optimizers.py
@@ -575,7 +575,7 @@ class Adamax(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [state_ops.assign_add(self.iterations, 1)]
+    self.updates = []
 
     lr = self.lr
     if self.initial_decay > 0:
@@ -583,7 +583,8 @@ class Adamax(Optimizer):
           1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                 K.dtype(self.decay))))
 
-    t = math_ops.cast(self.iterations, K.floatx()) + 1
+    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
+      t = math_ops.cast(self.iterations, K.floatx())
     lr_t = lr / (1. - math_ops.pow(self.beta_1, t))
 
     shapes = [K.int_shape(p) for p in params]
@@ -622,7 +623,7 @@ class Adamax(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@keras_export('keras.optimizers.Nadam')
+@keras_export(v1=['keras.optimizers.Nadam'])
 class Nadam(Optimizer):
   """Nesterov Adam optimizer.
 
@@ -739,7 +740,7 @@ class TFOptimizer(Optimizer, checkpointable.CheckpointableBase):
     return self.optimizer.compute_gradients(loss, params)
 
   def get_updates(self, loss, params):
-    if distribution_strategy_context.has_distribution_strategy():
+    if distribution_strategy_context.has_strategy():
       self.updates = []
 
       if not params:
diff --git a/tensorflow/python/keras/optimizers_test.py b/tensorflow/python/keras/optimizers_test.py
index 18a20567ce9db90725a1cb05c34ae6baeacbcd7c..de24bf0536fd60645bb3e8a60e6d9ace1df6b0b9 100644
--- a/tensorflow/python/keras/optimizers_test.py
+++ b/tensorflow/python/keras/optimizers_test.py
@@ -44,116 +44,119 @@ def _get_model(input_dim, num_hidden, output_dim):
   return model
 
 
-def _test_optimizer(optimizer, target=0.75):
-  np.random.seed(1337)
-  (x_train, y_train), _ = testing_utils.get_test_data(train_samples=1000,
-                                                      test_samples=200,
-                                                      input_shape=(10,),
-                                                      num_classes=2)
-  y_train = keras.utils.to_categorical(y_train)
-  model = _get_model(x_train.shape[1], 20, y_train.shape[1])
-  model.compile(loss='categorical_crossentropy',
-                optimizer=optimizer,
-                metrics=['accuracy'])
-  np.testing.assert_equal(keras.backend.get_value(model.optimizer.iterations),
-                          0)
-  history = model.fit(x_train, y_train, epochs=2, batch_size=16, verbose=0)
-  np.testing.assert_equal(keras.backend.get_value(model.optimizer.iterations),
-                          126)  # 63 steps per epoch
-  assert history.history['acc'][-1] >= target
-  config = keras.optimizers.serialize(optimizer)
-  optim = keras.optimizers.deserialize(config)
-  new_config = keras.optimizers.serialize(optim)
-  new_config['class_name'] = new_config['class_name'].lower()
-  new_config['config'].pop('name', None)
-  if 'amsgrad' not in config['config']:
-    new_config['config'].pop('amsgrad', None)
-  if 'decay' in new_config['config'] and 'schedule_decay' in config['config']:
-    new_config['config']['schedule_decay'] = new_config['config'].pop('decay')
-  if 'momentum' not in config['config']:
-    new_config['config'].pop('momentum', None)
-  if 'centered' not in config['config']:
-    new_config['config'].pop('centered', None)
-  assert config == new_config
-
-  # Test constraints.
-  model = keras.models.Sequential()
-  dense = keras.layers.Dense(10,
-                             input_shape=(x_train.shape[1],),
-                             kernel_constraint=lambda x: 0. * x + 1.,
-                             bias_constraint=lambda x: 0. * x + 2.,
-                             activation='relu')
-  model.add(dense)
-  model.add(keras.layers.Dense(y_train.shape[1], activation='softmax'))
-  model.compile(loss='categorical_crossentropy',
-                optimizer=optimizer,
-                metrics=['accuracy'])
-  np.testing.assert_equal(keras.backend.get_value(model.optimizer.iterations),
-                          126)  # Using same optimizer from before
-  model.train_on_batch(x_train[:10], y_train[:10])
-  np.testing.assert_equal(keras.backend.get_value(model.optimizer.iterations),
-                          127)
-  kernel, bias = dense.get_weights()
-  np.testing.assert_allclose(kernel, 1., atol=1e-3)
-  np.testing.assert_allclose(bias, 2., atol=1e-3)
-
-
 class KerasOptimizersTest(test.TestCase):
 
+  def _test_optimizer(self, optimizer, target=0.75):
+    np.random.seed(1337)
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=1000, test_samples=200, input_shape=(10,), num_classes=2)
+    y_train = keras.utils.to_categorical(y_train)
+    model = _get_model(x_train.shape[1], 20, y_train.shape[1])
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=optimizer,
+        metrics=['accuracy'])
+    np.testing.assert_equal(
+        keras.backend.get_value(model.optimizer.iterations), 0)
+    history = model.fit(x_train, y_train, epochs=2, batch_size=16, verbose=0)
+    np.testing.assert_equal(
+        keras.backend.get_value(model.optimizer.iterations),
+        126)  # 63 steps per epoch
+    assert history.history['acc'][-1] >= target
+    config = keras.optimizers.serialize(optimizer)
+    optim = keras.optimizers.deserialize(config)
+    new_config = keras.optimizers.serialize(optim)
+    new_config['class_name'] = new_config['class_name'].lower()
+    new_config['config'].pop('name', None)
+    if 'amsgrad' not in config['config']:
+      new_config['config'].pop('amsgrad', None)
+    if 'decay' in new_config['config'] and 'schedule_decay' in config['config']:
+      new_config['config']['schedule_decay'] = new_config['config'].pop('decay')
+    if 'momentum' not in config['config']:
+      new_config['config'].pop('momentum', None)
+    if 'centered' not in config['config']:
+      new_config['config'].pop('centered', None)
+    self.assertDictEqual(config, new_config)
+
+    # Test constraints.
+    model = keras.models.Sequential()
+    dense = keras.layers.Dense(
+        10,
+        input_shape=(x_train.shape[1],),
+        kernel_constraint=lambda x: 0. * x + 1.,
+        bias_constraint=lambda x: 0. * x + 2.,
+        activation='relu')
+    model.add(dense)
+    model.add(keras.layers.Dense(y_train.shape[1], activation='softmax'))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=optimizer,
+        metrics=['accuracy'])
+    np.testing.assert_equal(
+        keras.backend.get_value(model.optimizer.iterations),
+        126)  # Using same optimizer from before
+    model.train_on_batch(x_train[:10], y_train[:10])
+    np.testing.assert_equal(
+        keras.backend.get_value(model.optimizer.iterations), 127)
+    kernel, bias = dense.get_weights()
+    np.testing.assert_allclose(kernel, 1., atol=1e-3)
+    np.testing.assert_allclose(bias, 2., atol=1e-3)
+
   def test_sgd(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.SGD(lr=0.01,
-                                           momentum=0.9,
-                                           nesterov=True))
+      self._test_optimizer(keras.optimizers.SGD())
+
+  def test_momentum(self):
+    with self.cached_session():
+      self._test_optimizer(
+          keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True))
 
   def test_rmsprop(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.RMSprop())
-      _test_optimizer(keras.optimizers.RMSprop(decay=1e-3))
+      self._test_optimizer(keras.optimizers.RMSprop())
+      self._test_optimizer(keras.optimizers.RMSprop(decay=1e-3))
 
   def test_adagrad(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.Adagrad())
-      _test_optimizer(keras.optimizers.Adagrad(decay=1e-3))
+      self._test_optimizer(keras.optimizers.Adagrad())
+      self._test_optimizer(keras.optimizers.Adagrad(decay=1e-3))
 
   def test_adadelta(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.Adadelta(), target=0.6)
+      self._test_optimizer(keras.optimizers.Adadelta(), target=0.6)
       # Accuracy seems dependent on the initialization. Even adding tf.Print
       # nodes in the graph seemed to affect the initialization seed, and hence
       # the accuracy.
-      _test_optimizer(keras.optimizers.Adadelta(decay=1e-3), target=0.4)
+      self._test_optimizer(keras.optimizers.Adadelta(decay=1e-3), target=0.4)
 
   def test_adam(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.Adam())
+      self._test_optimizer(keras.optimizers.Adam())
       # Accuracy seems dependent on the seed initialization.
       # TODO(b/121051441): fix test flakiness.
-      _test_optimizer(keras.optimizers.Adam(decay=1e-3), target=0.73)
-      _test_optimizer(keras.optimizers.Adam(amsgrad=True))
+      self._test_optimizer(keras.optimizers.Adam(decay=1e-3), target=0.73)
+      self._test_optimizer(keras.optimizers.Adam(amsgrad=True))
 
   def test_adamax(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.Adamax())
-      _test_optimizer(keras.optimizers.Adamax(decay=1e-3))
+      self._test_optimizer(keras.optimizers.Adamax())
+      self._test_optimizer(keras.optimizers.Adamax(decay=1e-3))
 
   def test_nadam(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.Nadam())
+      self._test_optimizer(keras.optimizers.Nadam())
 
   def test_clipnorm(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.SGD(lr=0.01,
-                                           momentum=0.9,
-                                           clipnorm=0.5))
+      self._test_optimizer(
+          keras.optimizers.SGD(lr=0.01, momentum=0.9, clipnorm=0.5))
 
   def test_clipvalue(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.SGD(lr=0.01,
-                                           momentum=0.9,
-                                           clipvalue=0.5))
+      self._test_optimizer(
+          keras.optimizers.SGD(lr=0.01, momentum=0.9, clipvalue=0.5))
 
-  def test_tfoptimizer(self):
+  def test_tf_optimizer(self):
     optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01))
     model = keras.models.Sequential()
     model.add(keras.layers.Dense(
@@ -188,7 +191,7 @@ class KerasOptimizersTest(test.TestCase):
     self.assertIs(optimizer_weak(), None)
 
   @test_util.run_in_graph_and_eager_modes
-  def test_tfoptimizer_iterations(self):
+  def test_tf_optimizer_iterations(self):
     with self.cached_session():
       optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01))
       model = keras.models.Sequential()
diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py
index fd062b0ab337aa6fa62a7603a36749cde315c3da..6448f87d257e99d5cb38b95ad93b885ea87202ae 100644
--- a/tensorflow/python/keras/testing_utils.py
+++ b/tensorflow/python/keras/testing_utils.py
@@ -25,6 +25,13 @@ import numpy as np
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras.optimizer_v2 import adadelta as adadelta_v2
+from tensorflow.python.keras.optimizer_v2 import adagrad as adagrad_v2
+from tensorflow.python.keras.optimizer_v2 import adam as adam_v2
+from tensorflow.python.keras.optimizer_v2 import adamax as adamax_v2
+from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_v2
+from tensorflow.python.keras.optimizer_v2 import nadam as nadam_v2
+from tensorflow.python.keras.optimizer_v2 import rmsprop as rmsprop_v2
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_inspect
@@ -355,11 +362,20 @@ class _SubclassModel(keras.Model):
 
   def __init__(self, layers):
     super(_SubclassModel, self).__init__()
-    self.all_layers = layers
+    # Note that clone and build doesn't support lists of layers in subclassed
+    # models. Adding each layer directly here.
+    for i, layer in enumerate(layers):
+      setattr(self, self._layer_name_for_i(i), layer)
+
+    self.num_layers = len(layers)
+
+  def _layer_name_for_i(self, i):
+    return 'layer{}'.format(i)
 
   def call(self, inputs, **kwargs):
     x = inputs
-    for layer in self.all_layers:
+    for i in range(self.num_layers):
+      layer = getattr(self, self._layer_name_for_i(i))
       x = layer(x)
     return x
 
@@ -626,3 +642,39 @@ def get_multi_io_model(
     return keras.Model(inputs, outputs)
 
   raise ValueError('Unknown model type {}'.format(model_type))
+
+
+_V2_OPTIMIZER_MAP = {
+    'adadelta': adadelta_v2.Adadelta,
+    'adagrad': adagrad_v2.Adagrad,
+    'adam': adam_v2.Adam,
+    'adamax': adamax_v2.Adamax,
+    'nadam': nadam_v2.Nadam,
+    'rmsprop': rmsprop_v2.RMSprop,
+    'sgd': gradient_descent_v2.SGD
+}
+
+
+def get_v2_optimizer(name, **kwargs):
+  """Get the v2 optimizer requested.
+
+  This is only necessary until v2 are the default, as we are testing in Eager,
+  and Eager + v1 optimizers fail tests. When we are in v2, the strings alone
+  should be sufficient, and this mapping can theoretically be removed.
+
+  Args:
+    name: string name of Keras v2 optimizer.
+    **kwargs: any kwargs to pass to the optimizer constructor.
+
+  Returns:
+    Initialized Keras v2 optimizer.
+
+  Raises:
+    ValueError: if an unknown name was passed.
+  """
+  try:
+    return _V2_OPTIMIZER_MAP[name](**kwargs)
+  except KeyError:
+    raise ValueError(
+        'Could not find requested v2 optimizer: {}\nValid choices: {}'.format(
+            name, list(_V2_OPTIMIZER_MAP.keys())))
diff --git a/tensorflow/python/keras/utils/conv_utils.py b/tensorflow/python/keras/utils/conv_utils.py
index f486e631e50e5beb8da606879f23cd67131389f5..ea7427f61a8cc234f69df28d111d26b87b326a48 100644
--- a/tensorflow/python/keras/utils/conv_utils.py
+++ b/tensorflow/python/keras/utils/conv_utils.py
@@ -194,9 +194,11 @@ def normalize_data_format(value):
 
 
 def normalize_padding(value):
+  if isinstance(value, (list, tuple)):
+    return value
   padding = value.lower()
   if padding not in {'valid', 'same', 'causal'}:
-    raise ValueError('The `padding` argument must be one of '
+    raise ValueError('The `padding` argument must be a list/tuple or one of '
                      '"valid", "same" (or "causal", only for `Conv1D). '
                      'Received: ' + str(padding))
   return padding
diff --git a/tensorflow/python/keras/utils/layer_utils.py b/tensorflow/python/keras/utils/layer_utils.py
index ead5afd1ae6e1ef97ee5f8dbefbf6179e28343fc..1d85e8a25fe8e39f0edc22012c4a62f9ef14a058 100644
--- a/tensorflow/python/keras/utils/layer_utils.py
+++ b/tensorflow/python/keras/utils/layer_utils.py
@@ -196,7 +196,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
         continue
 
       for inbound_layer, node_index, tensor_index, _ in node.iterate_inbound():
-        connections.append('{}[{}][{}]'.format(inbound_layer, node_index,
+        connections.append('{}[{}][{}]'.format(inbound_layer.name, node_index,
                                                tensor_index))
 
     name = layer.name
diff --git a/tensorflow/python/keras/utils/losses_utils.py b/tensorflow/python/keras/utils/losses_utils.py
index fc4b4ac7dfd0966af5f4c21d4b78ba8ecd6bf46a..d42b354fb140bc592ee1127c3789069365371bc4 100644
--- a/tensorflow/python/keras/utils/losses_utils.py
+++ b/tensorflow/python/keras/utils/losses_utils.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
 from tensorflow.python.ops import array_ops
@@ -51,10 +52,31 @@ def squeeze_or_expand_dimensions(y_pred, y_true, sample_weight):
     the last dimension squeezed,
     `sample_weight` could be extended by one dimension.
   """
+  y_pred_shape = y_pred.get_shape()
+  y_pred_rank = y_pred_shape.ndims
   if y_true is not None:
-    # squeeze last dim of `y_pred` or `y_true` if their rank differs by 1
-    y_true, y_pred = confusion_matrix.remove_squeezable_dimensions(
-        y_true, y_pred)
+
+    # If sparse matrix is provided as `y_true`, the last dimension in `y_pred`
+    # may be > 1. Eg: y_true = [0, 1, 2] (shape=(3,)),
+    # y_pred = [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]] (shape=(3, 3))
+    # In this case, we should not try to remove squeezable dimension.
+    y_true_shape = y_true.get_shape()
+    y_true_rank = y_true_shape.ndims
+    if (y_true_rank is not None) and (y_pred_rank is not None):
+      # Use static rank for `y_true` and `y_pred`.
+      if (y_pred_rank - y_true_rank != 1) or y_pred_shape[-1] == 1:
+        y_true, y_pred = confusion_matrix.remove_squeezable_dimensions(
+            y_true, y_pred)
+    else:
+      # Use dynamic rank.
+      rank_diff = array_ops.rank(y_pred) - array_ops.rank(y_true)
+      squeeze_dims = lambda: confusion_matrix.remove_squeezable_dimensions(  # pylint: disable=g-long-lambda
+          y_true, y_pred)
+      is_last_dim_1 = math_ops.equal(1, array_ops.shape(y_pred)[-1])
+      maybe_squeeze_dims = lambda: control_flow_ops.cond(  # pylint: disable=g-long-lambda
+          is_last_dim_1, squeeze_dims, lambda: (y_true, y_pred))
+      y_true, y_pred = control_flow_ops.cond(
+          math_ops.equal(1, rank_diff), maybe_squeeze_dims, squeeze_dims)
 
   if sample_weight is None:
     return y_pred, y_true, None
@@ -65,8 +87,6 @@ def squeeze_or_expand_dimensions(y_pred, y_true, sample_weight):
   if weights_rank == 0:  # If weights is scalar, do nothing.
     return y_pred, y_true, sample_weight
 
-  y_pred_shape = y_pred.get_shape()
-  y_pred_rank = y_pred_shape.ndims
   if (y_pred_rank is not None) and (weights_rank is not None):
     # Use static rank.
     if weights_rank - y_pred_rank == 1:
@@ -167,8 +187,8 @@ def compute_weighted_loss(losses,
         losses, None, sample_weight)
     losses = ops.convert_to_tensor(losses)
     input_dtype = losses.dtype
-    losses = math_ops.to_float(losses)
-    sample_weight = math_ops.to_float(sample_weight)
+    losses = math_ops.cast(losses, dtypes.float32)
+    sample_weight = math_ops.cast(sample_weight, dtypes.float32)
 
     try:
       # Broadcast weights if possible.
diff --git a/tensorflow/python/kernel_tests/bias_op_test.py b/tensorflow/python/kernel_tests/bias_op_test.py
index 66f442dbddb5f609e7525ba0db9809dc3943ee25..8d04da6dbd7ca6c548349c047c7c4980a04560c6 100644
--- a/tensorflow/python/kernel_tests/bias_op_test.py
+++ b/tensorflow/python/kernel_tests/bias_op_test.py
@@ -196,9 +196,7 @@ class BiasAddTest(test.TestCase):
       self.assertAllClose(grad_jacob_t, grad_jacob_n, threshold, threshold)
 
   @test_util.run_deprecated_v1
-  def testGradientTensor(self):
-    # TODO(yongtang): BiasAddGrad with NCHW only works 4D. Reenable once
-    # all dimensions are supported.
+  def testGradientTensor2D(self):
     for (data_format, use_gpu) in ("NHWC", False), ("NHWC", True):
       for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
         np_input = np.array(
@@ -207,9 +205,18 @@ class BiasAddTest(test.TestCase):
         bias = np.array([1.3, 2.4], dtype=dtype.as_numpy_dtype)
         self._testGradient(np_input, bias, dtype, data_format, use_gpu)
 
+  @test_util.run_deprecated_v1
+  def testGradientTensor3D(self):
+    for (data_format, use_gpu) in [("NHWC", False), ("NHWC", True),
+                                   ("NCHW", False), ("NCHW", True)]:
+      for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
+        np_input = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+                            dtype=dtype.as_numpy_dtype).reshape(1, 3, 2)
+        bias = np.array([1.3, 2.4], dtype=dtype.as_numpy_dtype)
+        self._testGradient(np_input, bias, dtype, data_format, use_gpu)
+
   @test_util.run_deprecated_v1
   def testGradientTensor4D(self):
-    # BiasAddGrad with NCHW support 4D so all are enabled.
     for (data_format, use_gpu) in [("NHWC", False), ("NHWC", True),
                                    ("NCHW", False), ("NCHW", True)]:
       for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
@@ -219,6 +226,17 @@ class BiasAddTest(test.TestCase):
         bias = np.array([1.3, 2.4], dtype=dtype.as_numpy_dtype)
         self._testGradient(np_input, bias, dtype, data_format, use_gpu)
 
+  @test_util.run_deprecated_v1
+  def testGradientTensor5D(self):
+    for (data_format, use_gpu) in [("NHWC", False), ("NHWC", True),
+                                   ("NCHW", False), ("NCHW", True)]:
+      for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
+        np_input = np.arange(
+            1.0, 49.0, dtype=dtype.as_numpy_dtype).reshape(
+                [1, 2, 3, 4, 2]).astype(np.float32)
+        bias = np.array([1.3, 2.4], dtype=dtype.as_numpy_dtype)
+        self._testGradient(np_input, bias, dtype, data_format, use_gpu)
+
   @test_util.run_deprecated_v1
   def testEmpty(self):
     np.random.seed(7)
@@ -227,10 +245,15 @@ class BiasAddTest(test.TestCase):
 
   @test_util.run_deprecated_v1
   def testEmptyGradient(self):
-    # TODO(yongtang): BiasAddGrad with NCHW only works 4D. Reenable once
-    # all dimensions are supported.
     for (data_format, use_gpu) in ("NHWC", False), ("NHWC", True):
-      for shape in (0, 0), (2, 0), (0, 2), (4, 3, 0), (4, 0, 3), (0, 4, 3):
+      for shape in (0, 0), (2, 0), (0, 2):
+        self._testGradient(
+            np.random.randn(*shape), np.random.randn(shape[-1]), dtypes.float64,
+            data_format, use_gpu)
+
+    for (data_format, use_gpu) in [("NHWC", False), ("NHWC", True),
+                                   ("NCHW", False), ("NCHW", True)]:
+      for shape in (4, 3, 0), (4, 0, 3), (0, 4, 3):
         self._testGradient(
             np.random.randn(*shape),
             np.random.randn(shape[-1]), dtypes.float64, data_format, use_gpu)
diff --git a/tensorflow/python/kernel_tests/cond_v2_test.py b/tensorflow/python/kernel_tests/cond_v2_test.py
index 2c990261055a19c34b4b292c273e613e073a2bce..050da5ff6cc98c02b543fe40c477daa8dd0ec7aa 100644
--- a/tensorflow/python/kernel_tests/cond_v2_test.py
+++ b/tensorflow/python/kernel_tests/cond_v2_test.py
@@ -145,6 +145,22 @@ class CondV2Test(test.TestCase):
     self.assertEqual(cond_op.type, "If")
     return output, cond_op
 
+  def _createNestedCond(self, name):
+    """Like _createCond but creates a nested cond_v2 call as well."""
+    pred = constant_op.constant(True, name="pred")
+    x = constant_op.constant(1.0, name="x")
+
+    def true_fn():
+      return cond_v2.cond_v2(pred, lambda: x, lambda: x + 1)
+
+    def false_fn():
+      return x + 2
+
+    output = cond_v2.cond_v2(pred, true_fn, false_fn, name=name)
+    cond_op = output.op.inputs[0].op
+    self.assertEqual(cond_op.type, "If")
+    return output, cond_op
+
   def testDefaultName(self):
     with ops.Graph().as_default():
       _, cond_op = self._createCond(None)
@@ -645,9 +661,14 @@ class CondV2Test(test.TestCase):
       # Build the cond_v2 in an XLA context
       xla_context = control_flow_ops.XLAControlFlowContext()
       xla_context.Enter()
-      cond_output, _ = self._createCond("cond")
+      cond_output, cond_op = self._createCond("cond")
       xla_context.Exit()
 
+      # Check lowering attr is not set.
+      with self.assertRaises(ValueError):
+        cond_op.get_attr("_lower_using_switch_merge")
+
+      # Check the actual graph that is run.
       run_options = config_pb2.RunOptions(output_partition_graphs=True)
       run_metadata = config_pb2.RunMetadata()
       sess.run(cond_output, options=run_options, run_metadata=run_metadata)
@@ -672,6 +693,29 @@ class CondV2Test(test.TestCase):
           if_found,
           "An `If` op was not found, but the graph should not be lowered.")
 
+  @test_util.run_deprecated_v1
+  def testNestedLoweringDisabledInXLA(self):
+    # Build the cond_v2 in an XLA context
+    xla_context = control_flow_ops.XLAControlFlowContext()
+    xla_context.Enter()
+    _, cond_op = self._createNestedCond("cond")
+    xla_context.Exit()
+
+    # Check lowering attr is not set for either If node.
+    with self.assertRaises(ValueError):
+      cond_op.get_attr("_lower_using_switch_merge")
+
+    nested_if_ops = []
+    for func in ops.get_default_graph()._functions.values():
+      nested_if_ops.extend(op for op in func._graph.get_operations()
+                           if op.type == "If")
+    self.assertEqual(len(nested_if_ops), 1)
+    with self.assertRaises(ValueError):
+      nested_if_ops[0].get_attr("_lower_using_switch_merge")
+
+    # TODO(skyewm): check the actual graphs that are run once we have a way to
+    # programmatically access those graphs.
+
   @test_util.run_deprecated_v1
   def testLoweringDisabledWithSingleThreadedExecutorContext(self):
     with self.session(graph=ops.Graph()) as sess:
diff --git a/tensorflow/python/kernel_tests/confusion_matrix_test.py b/tensorflow/python/kernel_tests/confusion_matrix_test.py
index ae13c8e32e5ed5c8f3e6b670835db66d1e7dad0f..0ea5b1f5d8c35a1d5f7e883872475fdeb97688c6 100644
--- a/tensorflow/python/kernel_tests/confusion_matrix_test.py
+++ b/tensorflow/python/kernel_tests/confusion_matrix_test.py
@@ -470,9 +470,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
       }
-      with self.assertRaisesRegexp(
-          errors_impl.InvalidArgumentError,
-          "Can not squeeze dim\[2\]"):
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   r"Can not squeeze dim\[2\]"):
         dynamic_labels.eval(feed_dict=feed_dict)
       self.assertAllEqual(
           prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
@@ -498,9 +497,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
       }
       self.assertAllEqual(
           label_values, dynamic_labels.eval(feed_dict=feed_dict))
-      with self.assertRaisesRegexp(
-          errors_impl.InvalidArgumentError,
-          "Can not squeeze dim\[2\]"):
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   r"Can not squeeze dim\[2\]"):
         dynamic_predictions.eval(feed_dict=feed_dict)
 
 
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index fa62acbfebfd1a722fd6b9b4066fde5e5fd690f4..2949038728601c6b5ae79952de9ce482ecb2f665 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -1183,6 +1183,8 @@ class ControlFlowTest(test.TestCase):
 
   @test_util.run_v1_only("b/120545219")
   def testInvalidMaximumIterationsWhileLoopGradientInXLAContext(self):
+    if control_flow_util.ENABLE_CONTROL_FLOW_V2:
+      self.skipTest("WhileV2 does lazy evaluation of maximum_iterations")
     v = constant_op.constant(1.0)
 
     def inner_body(i, x):
@@ -1203,44 +1205,27 @@ class ControlFlowTest(test.TestCase):
     gs = gradients_impl.gradients(loop_no_xla, v)
     self.evaluate(gs)  # This should execute without error.
 
-    if control_flow_util.ENABLE_CONTROL_FLOW_V2:
-      xla_context = control_flow_ops.XLAControlFlowContext()
-      xla_context.Enter()
-      with self.assertRaisesRegexp(
-          ValueError,
-          r"maximum_iterations is None. It is required and must be statically "
-          r"known \(e.g. a constant value or known shape dimension\) when "
-          r"building while_loop in XLA context."):
-        loop_no_maxiter = create_while_loop()
-      with self.assertRaisesRegexp(
-          ValueError,
-          r"maximum_iterations must be statically "
-          r"known \(e.g. a constant value or known shape dimension\) when "
-          r"building while_loop in XLA context."):
-        loop_with_maxiter = create_while_loop(maximum_iterations=2)
-      xla_context.Exit()
-    else:
-      xla_context = control_flow_ops.XLAControlFlowContext()
-      xla_context.Enter()
-      loop_no_maxiter = create_while_loop()
-      loop_with_maxiter = create_while_loop(maximum_iterations=2)
-      xla_context.Exit()
+    xla_context = control_flow_ops.XLAControlFlowContext()
+    xla_context.Enter()
+    loop_no_maxiter = create_while_loop()
+    loop_with_maxiter = create_while_loop(maximum_iterations=2)
+    xla_context.Exit()
 
-      with self.assertRaisesRegexp(
-          ValueError,
-          r"Cannot create a gradient accumulator for tensor '.+' inside "
-          r"XLA while_loop because maximum_iterations was not passed to "
-          r"the tf.while_loop call \('.+'\)."):
-        _ = gradients_impl.gradients(loop_no_maxiter, v)
+    with self.assertRaisesRegexp(
+        ValueError,
+        r"Cannot create a gradient accumulator for tensor '.+' inside "
+        r"XLA while_loop because maximum_iterations was not passed to "
+        r"the tf.while_loop call \('.+'\)."):
+      _ = gradients_impl.gradients(loop_no_maxiter, v)
 
-      with self.assertRaisesRegexp(
-          ValueError,
-          r"Cannot create a gradient accumulator for tensor '.+' inside XLA "
-          r"while_loop. maximum_iterations tensor '.+' for while_loop context "
-          r"'.+' must be statically known \(e.g. a constant value or known "
-          r"shape dimension\), or be defined at or outside the while loop "
-          r"context '.*' \(currently defined in '.*'\)"):
-        _ = gradients_impl.gradients(loop_with_maxiter, v)
+    with self.assertRaisesRegexp(
+        ValueError,
+        r"Cannot create a gradient accumulator for tensor '.+' inside XLA "
+        r"while_loop. maximum_iterations tensor '.+' for while_loop context "
+        r"'.+' must be statically known \(e.g. a constant value or known "
+        r"shape dimension\), or be defined at or outside the while loop "
+        r"context '.*' \(currently defined in '.*'\)"):
+      _ = gradients_impl.gradients(loop_with_maxiter, v)
 
   @test_util.run_v1_only("b/120545219")
   def testInvalidMaximumIterationsFromSiblingContextWhileLoopInXLAContext(self):
@@ -1265,10 +1250,7 @@ class ControlFlowTest(test.TestCase):
       xla_context = control_flow_ops.XLAControlFlowContext()
       xla_context.Enter()
       with self.assertRaisesRegexp(
-          ValueError,
-          r"maximum_iterations must be statically known \(e.g. a constant value"
-          r" or known shape dimension\) when building while_loop in XLA "
-          r"context."):
+          ValueError, r"Tensor.*Placeholder:0.* must be from the same graph.*"):
         loop = create_while_loop()
       xla_context.Exit()
     else:
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index 2f6f3bb383b381de1dac78cc72882fe5fe4291c9..7ff1a61e472b0dae054804f8f014ead7782958b6 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -26,13 +26,18 @@ import numpy as np
 
 from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.contrib import layers
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_impl
@@ -165,6 +170,12 @@ class Conv2DTest(test.TestCase):
       # as we will be using its gradients as reference for fp16 gradients.
       return [dtypes.float32, dtypes.float16, dtypes.float64]
 
+  def _CreateNumpyTensor(self, shape):
+    total_size = 1
+    for s in shape:
+      total_size *= s
+    return np.arange(1, total_size + 1, dtype=np.float32).reshape(shape)
+
   def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, dilations,
                             strides, padding, data_format, dtype, use_gpu):
     """Verifies the output values of the convolution function.
@@ -183,26 +194,22 @@ class Conv2DTest(test.TestCase):
     Returns:
       Symbolic tensor value that can be used to execute the computation
     """
-    total_size_1 = 1
-    total_size_2 = 1
-    for s in tensor_in_sizes:
-      total_size_1 *= s
-    for s in filter_in_sizes:
-      total_size_2 *= s
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
-    x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
+    x1 = self._CreateNumpyTensor(tensor_in_sizes)
+    x2 = self._CreateNumpyTensor(filter_in_sizes)
 
     with test_util.device(use_gpu):
       t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=dtype)
       t2 = constant_op.constant(x2, shape=filter_in_sizes, dtype=dtype)
       strides = [1] + strides + [1]
       dilations = [1] + dilations + [1]
+      if isinstance(padding, (list, tuple)):
+        padding = [(0, 0)] + padding + [(0, 0)]
       if data_format == "NCHW":
         t1 = test_util.NHWCToNCHW(t1)
         strides = test_util.NHWCToNCHW(strides)
         dilations = test_util.NHWCToNCHW(dilations)
+        if isinstance(padding, (list, tuple)):
+          padding = test_util.NHWCToNCHW(padding)
       conv = nn_ops.conv2d(
           t1,
           t2,
@@ -254,17 +261,8 @@ class Conv2DTest(test.TestCase):
   def _ComputeReferenceDilatedConv(self, tensor_in_sizes, filter_in_sizes,
                                    stride, dilation, padding, data_format,
                                    use_gpu):
-    total_size_1 = 1
-    total_size_2 = 1
-    for s in tensor_in_sizes:
-      total_size_1 *= s
-    for s in filter_in_sizes:
-      total_size_2 *= s
-
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
-    x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
+    x1 = self._CreateNumpyTensor(tensor_in_sizes)
+    x2 = self._CreateNumpyTensor(filter_in_sizes)
     with test_util.device(use_gpu):
       t1 = constant_op.constant(x1, shape=tensor_in_sizes)
       t2 = constant_op.constant(x2, shape=filter_in_sizes)
@@ -312,16 +310,29 @@ class Conv2DTest(test.TestCase):
       expected_values = self.evaluate(expected_results)
       computed_values = self.evaluate(computed_results)
       for e_value, c_value in zip(expected_values, computed_values):
-        tf_logging.info("expected = ", e_value)
-        tf_logging.info("actual = ", c_value)
+        tf_logging.debug("expected = %s", e_value)
+        tf_logging.debug("actual = %s", c_value)
         self.assertAllClose(
             e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=1e-4)
 
-  def _VerifyValues(self, tensor_in_sizes, filter_in_sizes, strides, padding,
-                    expected):
+  def _VerifyValues(self,
+                    tensor_in_sizes,
+                    filter_in_sizes,
+                    strides,
+                    padding,
+                    expected,
+                    dilations=(1, 1),
+                    gpu_only=False,
+                    test_grappler_layout_optimizer=False,
+                    tol=1e-5,
+                    fp16_tol=1e-3):
+    if gpu_only and not test.is_gpu_available(cuda_only=True):
+      return
     tensors = []
-    dilations = [1, 1]
+    dilations = list(dilations)
     for (data_format, use_gpu) in GetTestConfigs():
+      if gpu_only and not use_gpu:
+        continue
       for dtype in self._DtypesToTest(use_gpu):
         result = self._SetupValuesForDevice(
             tensor_in_sizes,
@@ -332,19 +343,71 @@ class Conv2DTest(test.TestCase):
             data_format,
             dtype,
             use_gpu=use_gpu)
+        if test_grappler_layout_optimizer and data_format == "NHWC" and use_gpu:
+          # Grappler's layout optimizer will not optimize a fetch node, so
+          # this identity allows Grappler to optimize the Conv2D node.
+          result = array_ops.identity(result)
         tensors.append(result)
       values = self.evaluate(tensors)
       for i in range(len(tensors)):
         conv = tensors[i]
         value = values[i]
-        tf_logging.info("expected = ", expected)
-        tf_logging.info("actual = ", value)
-        tol = 1e-5
-        if value.dtype == np.float16:
-          tol = 1e-3
-        self.assertAllClose(expected, np.ravel(value), atol=tol, rtol=tol)
+        tf_logging.debug("expected = %s", expected)
+        tf_logging.debug("actual = %s", value)
+        tol_to_use = fp16_tol if value.dtype == np.float16 else tol
+        self.assertAllClose(expected, np.ravel(value), atol=tol_to_use,
+                            rtol=tol_to_use)
         self.assertShapeEqual(value, conv)
 
+  def _VerifyExplicitPaddings(self,
+                              tensor_in_sizes,
+                              filter_in_sizes,
+                              strides,
+                              padding,
+                              dilations=(1, 1),
+                              test_grappler_layout_optimizer=False,
+                              tol=1e-5,
+                              fp16_tol=1e-3):
+    """Verifies Conv2D with explicit padding generates correct values.
+
+    It does this by comparing with Conv2D without explicit padding. This
+    function assumes Conv2D without explicit padding works correctly.
+
+    Args:
+      tensor_in_sizes: Input tensor dimensions in [batch, input_rows,
+        input_cols, input_depth].
+      filter_in_sizes: Filter tensor dimensions in [kernel_rows, kernel_cols,
+        input_depth, output_depth].
+      strides: [row_stride, col_stride] for the convolution;
+      padding: Explicit padding amounts.
+      dilations: Dilation values
+      test_grappler_layout_optimizer: If True, allow the Grappler layout
+        optimizer to run, which turns NHWC Conv2Ds on the GPU to NCHW Conv2Ds.
+      tol: The absolute and relative tolerance for non-fp16 dtypes.
+      fp16_tol: The absolute and relative tolerance for fp16.
+    """
+    input_tensor = self._CreateNumpyTensor(tensor_in_sizes)
+    filter_tensor = self._CreateNumpyTensor(filter_in_sizes)
+    input_tensor = array_ops.pad(input_tensor, [(0, 0)] + padding + [(0, 0)])
+    dilations = list(dilations)
+    conv2d_result = nn_ops.conv2d(
+        input_tensor,
+        filter_tensor, [1] + list(strides) + [1],
+        "VALID",
+        dilations=[1] + dilations + [1])
+    expected = list(self.evaluate(array_ops.reshape(conv2d_result, [-1])))
+    self._VerifyValues(
+        tensor_in_sizes,
+        filter_in_sizes,
+        strides,
+        padding,
+        expected,
+        dilations,
+        gpu_only=True,
+        test_grappler_layout_optimizer=test_grappler_layout_optimizer,
+        tol=tol,
+        fp16_tol=fp16_tol)
+
   @test_util.run_in_graph_and_eager_modes
   def testConv2D1x1Filter(self):
     expected_output = [
@@ -510,6 +573,126 @@ class Conv2DTest(test.TestCase):
         dilations=[2, 2],
         padding="VALID")
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D0x0Padding(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 3, 3],
+        filter_in_sizes=[2, 2, 3, 3],
+        strides=[1, 1],
+        padding=[[0, 0], [0, 0]])
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[3, 4, 3, 2],
+        filter_in_sizes=[1, 1, 2, 1],
+        strides=[2, 2],
+        padding=[[0, 0], [0, 0]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D1x1Padding(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 3, 2],
+        filter_in_sizes=[2, 2, 2, 2],
+        strides=[1, 1],
+        padding=[[1, 1], [1, 1]])
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 2, 1],
+        filter_in_sizes=[1, 1, 1, 2],
+        strides=[1, 1],
+        padding=[[1, 1], [1, 1]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Padding(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 1, 2],
+        filter_in_sizes=[2, 1, 2, 1],
+        strides=[1, 1],
+        padding=[[2, 2], [2, 2]])
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 1, 2],
+        filter_in_sizes=[1, 1, 2, 1],
+        strides=[2, 1],
+        padding=[[2, 2], [2, 2]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2DOnlyBottomPadding(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 3, 3],
+        filter_in_sizes=[2, 2, 3, 2],
+        strides=[1, 1],
+        padding=[[0, 3], [0, 0]], tol=2e-5)
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[2, 2, 4, 3],
+        filter_in_sizes=[1, 2, 3, 2],
+        strides=[2, 2],
+        padding=[[0, 3], [0, 0]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2DOnlyTopRightPadding(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 3, 3],
+        filter_in_sizes=[2, 2, 3, 2],
+        strides=[1, 1],
+        padding=[[1, 0], [0, 2]],
+        tol=5e-5)
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 4, 2],
+        filter_in_sizes=[2, 2, 2, 2],
+        strides=[1, 3],
+        padding=[[1, 0], [0, 2]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2DLotsPadding(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 1, 1, 3],
+        filter_in_sizes=[2, 2, 3, 3],
+        strides=[1, 1],
+        padding=[[3, 4], [4, 2]])
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 1, 1],
+        filter_in_sizes=[2, 2, 1, 3],
+        strides=[2, 1],
+        padding=[[3, 4], [4, 2]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2DExplicitPaddingWithDilations(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 3, 2, 1],
+        filter_in_sizes=[1, 2, 1, 2],
+        strides=[1, 1],
+        padding=[[1, 0], [0, 1]],
+        dilations=[2, 1])
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 3, 2],
+        filter_in_sizes=[3, 2, 2, 1],
+        strides=[1, 1],
+        padding=[[2, 1], [1, 2]],
+        dilations=[2, 3])
+
+  def testConv2DExplicitPaddingWithLayoutOptimizer(self):
+    # Test with Grappler's layout optimizer, to ensure the layout optimizer
+    # handles explicit padding correctly.
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 3, 2, 1],
+        filter_in_sizes=[1, 2, 1, 2],
+        strides=[1, 1],
+        padding=[[1, 0], [0, 1]],
+        dilations=[2, 1],
+        test_grappler_layout_optimizer=True)
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 3, 2],
+        filter_in_sizes=[3, 2, 2, 1],
+        strides=[1, 1],
+        padding=[[2, 1], [1, 2]],
+        dilations=[2, 3],
+        test_grappler_layout_optimizer=True)
+
   # TODO(yzhwang): this currently fails.
   # self._VerifyValues(tensor_in_sizes=[1, 8, 8, 1],
   #                   filter_in_sizes=[2, 2, 1, 1],
@@ -517,19 +700,22 @@ class Conv2DTest(test.TestCase):
   #                   expected=[72, 112, 392, 432])
 
   # Testing for backprops
-  def _RunAndVerifyBackpropInput(self, input_sizes, filter_sizes, output_sizes,
-                                 strides, padding, expected, data_format,
-                                 use_gpu, err):
-    total_output_size = 1
-    total_filter_size = 1
-    for s in output_sizes:
-      total_output_size *= s
-    for s in filter_sizes:
-      total_filter_size *= s
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x1 = [f * 1.0 for f in range(1, total_filter_size + 1)]
-    x2 = [f * 1.0 for f in range(1, total_output_size + 1)]
+  def _RunAndVerifyBackpropInput(self,
+                                 input_sizes,
+                                 filter_sizes,
+                                 output_sizes,
+                                 strides,
+                                 padding,
+                                 expected,
+                                 data_format,
+                                 use_gpu,
+                                 err,
+                                 dilations=(1, 1)):
+    if use_gpu and not test.is_gpu_available(cuda_only=True):
+      return
+    x1 = self._CreateNumpyTensor(filter_sizes)
+    x2 = self._CreateNumpyTensor(output_sizes)
+    dilations = list(dilations)
     with test_util.device(use_gpu):
       if data_format == "NCHW":
         input_sizes = test_util.NHWCToNCHW(input_sizes)
@@ -537,18 +723,30 @@ class Conv2DTest(test.TestCase):
       t1 = constant_op.constant(x1, shape=filter_sizes)
       t2 = constant_op.constant(x2, shape=output_sizes)
       strides = [1] + strides + [1]
+      dilations = [1] + dilations + [1]
+      if isinstance(padding, (list, tuple)):
+        padding = [(0, 0)] + padding + [(0, 0)]
       if data_format == "NCHW":
         t2 = test_util.NHWCToNCHW(t2)
         strides = test_util.NHWCToNCHW(strides)
+        dilations = test_util.NHWCToNCHW(dilations)
+        if isinstance(padding, (list, tuple)):
+          padding = test_util.NHWCToNCHW((padding))
       conv = nn_ops.conv2d_backprop_input(
-          t0, t1, t2, strides=strides, padding=padding, data_format=data_format)
+          t0,
+          t1,
+          t2,
+          strides=strides,
+          padding=padding,
+          data_format=data_format,
+          dilations=dilations)
       if data_format == "NCHW":
         conv = test_util.NCHWToNHWC(conv)
       # "values" consists of two tensors for two backprops
       value = self.evaluate(conv)
       self.assertShapeEqual(value, conv)
-    tf_logging.info("expected = ", expected)
-    tf_logging.info("actual = ", value)
+    tf_logging.debug("expected = %s", expected)
+    tf_logging.debug("actual = %s", value)
     self.assertArrayNear(expected, value.flatten(), err)
 
   def _CompareBackpropInput(self, input_sizes, filter_sizes, output_sizes,
@@ -691,41 +889,51 @@ class Conv2DTest(test.TestCase):
           err=1e-5)
 
   # Testing for backprops
-  def _RunAndVerifyBackpropFilter(self, input_sizes, filter_sizes, output_sizes,
-                                  strides, padding, expected, data_format,
-                                  use_gpu):
-    total_input_size = 1
-    total_output_size = 1
-    for s in input_sizes:
-      total_input_size *= s
-    for s in output_sizes:
-      total_output_size *= s
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x0 = [f * 1.0 for f in range(1, total_input_size + 1)]
-    x2 = [f * 1.0 for f in range(1, total_output_size + 1)]
+  def _RunAndVerifyBackpropFilter(self,
+                                  input_sizes,
+                                  filter_sizes,
+                                  output_sizes,
+                                  strides,
+                                  padding,
+                                  expected,
+                                  data_format,
+                                  use_gpu,
+                                  dilations=(1, 1),
+                                  err=1e-5):
+    x0 = self._CreateNumpyTensor(input_sizes)
+    x2 = self._CreateNumpyTensor(output_sizes)
+    dilations = list(dilations)
+    explicit_strides = [1] + strides + [1]
+    new_padding = padding
+    new_dilations = [1] + dilations + [1]
+    if isinstance(new_padding, (list, tuple)):
+      new_padding = [(0, 0)] + new_padding + [(0, 0)]
+    if data_format == "NCHW":
+      explicit_strides = test_util.NHWCToNCHW(explicit_strides)
+      new_dilations = test_util.NHWCToNCHW(new_dilations)
+      if isinstance(padding, (list, tuple)):
+        new_padding = test_util.NHWCToNCHW(new_padding)
     for dtype in self._DtypesToTest(use_gpu=use_gpu):
       with test_util.device(use_gpu):
         t0 = constant_op.constant(x0, shape=input_sizes, dtype=dtype)
         t1 = constant_op.constant(filter_sizes, shape=[len(filter_sizes)])
         t2 = constant_op.constant(x2, shape=output_sizes, dtype=dtype)
-        explicit_strides = [1] + strides + [1]
         if data_format == "NCHW":
           t0 = test_util.NHWCToNCHW(t0)
           t2 = test_util.NHWCToNCHW(t2)
-          explicit_strides = test_util.NHWCToNCHW(explicit_strides)
         conv = nn_ops.conv2d_backprop_filter(
             t0,
             t1,
             t2,
             strides=explicit_strides,
-            padding=padding,
+            padding=new_padding,
+            dilations=new_dilations,
             data_format=data_format)
         value = self.evaluate(conv)
         self.assertShapeEqual(value, conv)
-      tf_logging.info("expected = ", expected)
-      tf_logging.info("actual = ", value)
-      self.assertArrayNear(expected, value.flatten(), 1e-5)
+      tf_logging.debug("expected = %s", expected)
+      tf_logging.debug("actual = %s", value)
+      self.assertArrayNear(expected, value.flatten(), err)
 
   def _CompareBackFilter(self, input_sizes, filter_sizes, output_sizes,
                          conv_strides, padding):
@@ -866,16 +1074,8 @@ class Conv2DTest(test.TestCase):
   def _RunAndVerifyBackpropInputDilation(self, input_sizes, filter_sizes,
                                          output_sizes, strides, dilations,
                                          padding, data_format, use_gpu, err):
-    total_input_size = 1
-    total_filter_size = 1
-    for s in input_sizes:
-      total_input_size *= s
-    for s in filter_sizes:
-      total_filter_size *= s
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x1 = [f * 1.0 for f in range(1, total_input_size + 1)]
-    x2 = [f * 1.0 for f in range(1, total_filter_size + 1)]
+    x1 = self._CreateNumpyTensor(input_sizes)
+    x2 = self._CreateNumpyTensor(filter_sizes)
     default_dilations = (dilations[0] == 1 and dilations[1] == 1)
     if default_dilations or use_gpu:
       with self.cached_session(use_gpu=use_gpu) as sess:
@@ -912,24 +1112,16 @@ class Conv2DTest(test.TestCase):
         value_2 = self.evaluate(conv_2)
         self.assertShapeEqual(value, conv)
         self.assertShapeEqual(value_2, conv_2)
-      tf_logging.info("expected = ", value_2)
-      tf_logging.info("actual = ", value)
+      tf_logging.debug("expected = %s", value_2)
+      tf_logging.debug("actual = %s", value)
       self.assertArrayNear(value_2.flatten(), value.flatten(), err)
 
   # Testing for backprops
   def _RunAndVerifyBackpropFilterDilation(self, input_sizes, filter_sizes,
                                           output_sizes, strides, dilations,
                                           padding, data_format, use_gpu, err):
-    total_input_size = 1
-    total_filter_size = 1
-    for s in input_sizes:
-      total_input_size *= s
-    for s in filter_sizes:
-      total_filter_size *= s
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x1 = [f * 1.0 for f in range(1, total_input_size + 1)]
-    x2 = [f * 1.0 for f in range(1, total_filter_size + 1)]
+    x1 = self._CreateNumpyTensor(input_sizes)
+    x2 = self._CreateNumpyTensor(filter_sizes)
     default_dilations = (dilations[0] == 1 and dilations[1] == 1)
     if default_dilations or use_gpu:
       with self.cached_session(use_gpu=use_gpu) as sess:
@@ -965,8 +1157,8 @@ class Conv2DTest(test.TestCase):
         value_2 = self.evaluate(conv_2)
         self.assertShapeEqual(value, conv)
         self.assertShapeEqual(value_2, conv_2)
-      tf_logging.info("expected = ", value_2)
-      tf_logging.info("actual = ", value)
+      tf_logging.debug("expected = %s", value_2)
+      tf_logging.debug("actual = %s", value)
       self.assertArrayNear(value_2.flatten(), value.flatten(), err)
 
   def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self):
@@ -1111,20 +1303,347 @@ class Conv2DTest(test.TestCase):
             use_gpu=use_gpu,
             err=1e-5)
 
+  def _RunAndVerifyBackpropInputExplicitPadding(self,
+                                                input_sizes,
+                                                filter_sizes,
+                                                output_sizes,
+                                                strides,
+                                                padding,
+                                                data_format,
+                                                dilations=(1, 1),
+                                                err=2e-5):
+    x1 = self._CreateNumpyTensor(filter_sizes)
+    x2 = self._CreateNumpyTensor(output_sizes)
+    dilations = list(dilations)
+    padded_input_sizes = input_sizes[:]
+    padded_input_sizes[1] += padding[0][0] + padding[0][1]
+    padded_input_sizes[2] += padding[1][0] + padding[1][1]
+    c = nn_ops.conv2d_backprop_input(
+        padded_input_sizes,
+        x1,
+        x2,
+        strides=[1] + strides + [1],
+        padding="VALID",
+        dilations=[1] + dilations + [1])
+    c = c[:, padding[0][0]:(c.shape[1] - padding[0][1]), padding[1][0]:(
+        c.shape[2] - padding[1][1]), :]
+    expected = list(self.evaluate(array_ops.reshape(c, [-1])))
+    self._RunAndVerifyBackpropInput(
+        input_sizes,
+        filter_sizes,
+        output_sizes,
+        strides,
+        padding,
+        expected,
+        data_format,
+        use_gpu=True,
+        err=err,
+        dilations=dilations)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Depth1Padding0x0BackpropInput(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 1, 2, 1],
+            strides=[1, 1],
+            padding=[[0, 0], [0, 0]],
+            data_format=data_format)
+
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 3, 4, 2],
+            filter_sizes=[2, 2, 2, 3],
+            output_sizes=[1, 1, 2, 3],
+            strides=[2, 2],
+            padding=[[0, 0], [0, 0]],
+            data_format=data_format)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Depth1Padding1x1BackpropInput(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 2],
+            output_sizes=[1, 3, 4, 2],
+            strides=[1, 1],
+            padding=[[1, 1], [1, 1]],
+            data_format=data_format, err=1e-4)
+
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 2, 3, 2],
+            filter_sizes=[1, 1, 2, 1],
+            output_sizes=[1, 4, 3, 1],
+            strides=[1, 2],
+            padding=[[1, 1], [1, 1]],
+            data_format=data_format)
+
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 4, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 4, 2, 1],
+            strides=[1, 2],
+            padding=[[1, 1], [1, 1]],
+            data_format=data_format,
+            dilations=[2, 2])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Depth1Padding2x2BackpropInput(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[2, 3, 1, 1],
+            filter_sizes=[2, 1, 1, 1],
+            output_sizes=[2, 2, 5, 1],
+            strides=[3, 1],
+            padding=[[2, 2], [2, 2]],
+            data_format=data_format)
+
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 3, 6, 1],
+            filter_sizes=[3, 2, 1, 1],
+            output_sizes=[1, 3, 4, 1],
+            strides=[1, 2],
+            padding=[[2, 2], [2, 2]],
+            data_format=data_format,
+            dilations=[2, 3])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Depth1Padding_1_8_4_1_BackpropInput(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 10, 8, 1],
+            strides=[1, 1],
+            padding=[[1, 8], [4, 2]],
+            data_format=data_format, err=5e-5)
+
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 5, 3, 1],
+            filter_sizes=[3, 2, 1, 1],
+            output_sizes=[1, 4, 8, 1],
+            strides=[3, 1],
+            padding=[[1, 8], [4, 2]],
+            data_format=data_format)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Depth1Padding_5_0_2_2_BackpropInput(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 3, 3, 1],
+            filter_sizes=[2, 1, 1, 1],
+            output_sizes=[1, 7, 7, 1],
+            strides=[1, 1],
+            padding=[[5, 0], [2, 2]],
+            data_format=data_format,
+            err=5e-5)
+
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 4, 2, 1],
+            filter_sizes=[3, 3, 1, 1],
+            output_sizes=[1, 5, 2, 1],
+            strides=[1, 2],
+            padding=[[5, 0], [2, 2]],
+            data_format=data_format,
+            dilations=[2, 1])
+
+  def _RunAndVerifyBackpropFilterExplicitPadding(self,
+                                                 input_sizes,
+                                                 filter_sizes,
+                                                 output_sizes,
+                                                 strides,
+                                                 padding,
+                                                 data_format,
+                                                 dilations=(1, 1),
+                                                 err=1e-5):
+    x0 = self._CreateNumpyTensor(input_sizes)
+    x2 = self._CreateNumpyTensor(output_sizes)
+    dilations = list(dilations)
+
+    x0 = np.pad(x0, [(0, 0)] + padding + [(0, 0)], "constant")
+    c = nn_ops.conv2d_backprop_filter(
+        x0,
+        filter_sizes,
+        x2,
+        strides=[1] + strides + [1],
+        padding="VALID",
+        dilations=[1] + dilations + [1])
+    expected = list(self.evaluate(array_ops.reshape(c, [-1])))
+    self._RunAndVerifyBackpropFilter(
+        input_sizes,
+        filter_sizes,
+        output_sizes,
+        strides,
+        padding,
+        expected,
+        data_format,
+        use_gpu=True,
+        dilations=dilations,
+        err=err)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Depth1Padding0x0BackpropFilter(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 1, 2, 1],
+            strides=[1, 1],
+            padding=[[0, 0], [0, 0]],
+            data_format=data_format)
+
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 3, 4, 2],
+            filter_sizes=[2, 2, 2, 3],
+            output_sizes=[1, 1, 2, 3],
+            strides=[2, 2],
+            padding=[[0, 0], [0, 0]],
+            data_format=data_format)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Depth1Padding1x1BackpropFilter(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 2],
+            output_sizes=[1, 3, 4, 2],
+            strides=[1, 1],
+            padding=[[1, 1], [1, 1]],
+            data_format=data_format,
+            err=5e-5)
+
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 2, 3, 2],
+            filter_sizes=[1, 1, 2, 1],
+            output_sizes=[1, 4, 3, 1],
+            strides=[1, 2],
+            padding=[[1, 1], [1, 1]],
+            data_format=data_format)
+
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 4, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 4, 2, 1],
+            strides=[1, 2],
+            padding=[[1, 1], [1, 1]],
+            data_format=data_format,
+            dilations=[2, 2])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Depth1Padding2x2BackpropFilter(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[2, 3, 1, 1],
+            filter_sizes=[2, 1, 1, 1],
+            output_sizes=[2, 2, 5, 1],
+            strides=[3, 1],
+            padding=[[2, 2], [2, 2]],
+            data_format=data_format)
+
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 3, 6, 1],
+            filter_sizes=[3, 2, 1, 1],
+            output_sizes=[1, 3, 4, 1],
+            strides=[1, 2],
+            padding=[[2, 2], [2, 2]],
+            data_format=data_format,
+            dilations=[2, 3])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Depth1Padding_1_8_4_1_BackpropFilter(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 10, 8, 1],
+            strides=[1, 1],
+            padding=[[1, 8], [4, 2]],
+            data_format=data_format,
+            err=1e-4)
+
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 5, 3, 1],
+            filter_sizes=[3, 2, 1, 1],
+            output_sizes=[1, 4, 8, 1],
+            strides=[3, 1],
+            padding=[[1, 8], [4, 2]],
+            data_format=data_format)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Depth1Padding_5_0_2_2_BackpropFilter(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 3, 3, 1],
+            filter_sizes=[2, 1, 1, 1],
+            output_sizes=[1, 7, 7, 1],
+            strides=[1, 1],
+            padding=[[5, 0], [2, 2]],
+            data_format=data_format,
+            err=1e-4)
+
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 4, 2, 1],
+            filter_sizes=[3, 3, 1, 1],
+            output_sizes=[1, 5, 2, 1],
+            strides=[1, 2],
+            padding=[[5, 0], [2, 2]],
+            data_format=data_format,
+            dilations=[2, 1])
+
   # Gradient checkers
   def ConstructAndTestGradient(self, batch, input_rows, input_cols, filter_rows,
                                filter_cols, in_depth, out_depth, stride_rows,
                                stride_cols, padding, test_input, data_format,
-                               use_gpu):
+                               use_gpu, max_err=0.002):
     input_shape = [batch, input_rows, input_cols, in_depth]
     filter_shape = [filter_rows, filter_cols, in_depth, out_depth]
     # TODO(yangke): re-factor the computation of output shape.
     if padding == "VALID":
       output_rows = (input_rows - filter_rows + stride_rows) // stride_rows
       output_cols = (input_cols - filter_cols + stride_cols) // stride_cols
-    else:
+    elif padding == "SAME":
       output_rows = (input_rows + stride_rows - 1) // stride_rows
       output_cols = (input_cols + stride_cols - 1) // stride_cols
+    else:
+      self.assertIsInstance(padding, (list, tuple))
+      output_rows = (input_rows + padding[1][0] + padding[1][1] - filter_rows +
+                     stride_rows) // stride_rows
+      output_cols = (input_cols + padding[2][0] + padding[2][1] - filter_cols +
+                     stride_cols) // stride_cols
     output_shape = [batch, output_rows, output_cols, out_depth]
     input_size = 1
     for x in input_shape:
@@ -1145,16 +1664,19 @@ class Conv2DTest(test.TestCase):
         filter_tensor = constant_op.constant(
             filter_data, shape=filter_shape, dtype=dtype, name="filter")
         strides = [1, stride_rows, stride_cols, 1]
+        new_padding = padding
         if data_format == "NCHW":
           new_input_tensor = test_util.NHWCToNCHW(input_tensor)
           strides = test_util.NHWCToNCHW(strides)
+          if isinstance(padding, (list, tuple)):
+            new_padding = test_util.NHWCToNCHW(padding)
         else:
           new_input_tensor = input_tensor
         conv = nn_ops.conv2d(
             new_input_tensor,
             filter_tensor,
             strides,
-            padding,
+            new_padding,
             data_format=data_format,
             name="conv")
         if data_format == "NCHW":
@@ -1178,8 +1700,8 @@ class Conv2DTest(test.TestCase):
           # since fp16 numerical gradients are too imprecise.
           err = np.fabs(jacob_t - reference_jacob_t).max()
 
-        tf_logging.info("conv_2d gradient error = ", err)
-        self.assertLess(err, 0.002)
+        tf_logging.debug("conv_2d gradient error = %s", err)
+        self.assertLess(err, max_err)
 
   def testInputGradientValidPaddingStrideOne(self):
     for (data_format, use_gpu) in GetTestConfigs():
@@ -1436,6 +1958,248 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  def testInputGradient1x1PaddingStrideOne(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=5,
+            input_cols=4,
+            filter_rows=3,
+            filter_cols=3,
+            in_depth=2,
+            out_depth=3,
+            stride_rows=1,
+            stride_cols=1,
+            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
+            test_input=True,
+            data_format=data_format,
+            use_gpu=use_gpu,
+            max_err=0.0025)
+
+  def testFilterGradient1x1PaddingStrideOne(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=5,
+            input_cols=4,
+            filter_rows=3,
+            filter_cols=3,
+            in_depth=2,
+            out_depth=3,
+            stride_rows=1,
+            stride_cols=1,
+            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
+            test_input=False,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  def testInputGradient1x1PaddingStrideTwo(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=4,
+            input_cols=5,
+            filter_rows=3,
+            filter_cols=3,
+            in_depth=2,
+            out_depth=3,
+            stride_rows=2,
+            stride_cols=2,
+            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
+            test_input=True,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  def testFilterGradient1x1PaddingStrideTwo(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=4,
+            input_cols=5,
+            filter_rows=3,
+            filter_cols=3,
+            in_depth=2,
+            out_depth=3,
+            stride_rows=2,
+            stride_cols=2,
+            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
+            test_input=False,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  def testInputGradient2x2PaddingStrideOne(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=5,
+            input_cols=4,
+            filter_rows=3,
+            filter_cols=3,
+            in_depth=2,
+            out_depth=3,
+            stride_rows=1,
+            stride_cols=1,
+            padding=[[0, 0], [2, 2], [2, 2], [0, 0]],
+            test_input=True,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  def testFilterGradient2x2PaddingStrideOne(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=5,
+            input_cols=4,
+            filter_rows=3,
+            filter_cols=3,
+            in_depth=2,
+            out_depth=3,
+            stride_rows=1,
+            stride_cols=1,
+            padding=[[0, 0], [2, 2], [2, 2], [0, 0]],
+            test_input=False,
+            data_format=data_format,
+            use_gpu=use_gpu,
+            max_err=0.003)
+
+  def testInputGradient1_2_3_4PaddingStride3x2(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=8,
+            input_cols=5,
+            filter_rows=4,
+            filter_cols=2,
+            in_depth=3,
+            out_depth=2,
+            stride_rows=3,
+            stride_cols=2,
+            padding=[[0, 0], [1, 2], [3, 4], [0, 0]],
+            test_input=True,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  def testFilterGradient1_2_3_4PaddingStride3x2(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=8,
+            input_cols=5,
+            filter_rows=4,
+            filter_cols=2,
+            in_depth=3,
+            out_depth=2,
+            stride_rows=3,
+            stride_cols=2,
+            padding=[[0, 0], [1, 2], [3, 4], [0, 0]],
+            test_input=False,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  def testInputGradient4_3_2_1PaddingStride2x1(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=3,
+            input_rows=5,
+            input_cols=7,
+            filter_rows=3,
+            filter_cols=2,
+            in_depth=1,
+            out_depth=2,
+            stride_rows=2,
+            stride_cols=1,
+            padding=[[0, 0], [4, 3], [2, 1], [0, 0]],
+            test_input=True,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  def testFilterGradient4_3_2_1PaddingStride2x1(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=3,
+            input_rows=5,
+            input_cols=7,
+            filter_rows=3,
+            filter_cols=2,
+            in_depth=1,
+            out_depth=2,
+            stride_rows=2,
+            stride_cols=1,
+            padding=[[0, 0], [4, 3], [2, 1], [0, 0]],
+            test_input=False,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  def testInputGradient0_0_0_5PaddingStride1x2(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=6,
+            input_cols=7,
+            filter_rows=3,
+            filter_cols=4,
+            in_depth=3,
+            out_depth=2,
+            stride_rows=1,
+            stride_cols=2,
+            padding=[[0, 0], [0, 0], [0, 5], [0, 0]],
+            test_input=True,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  def testFilterGradient0_0_0_5PaddingStride1x2(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=6,
+            input_cols=7,
+            filter_rows=3,
+            filter_cols=4,
+            in_depth=3,
+            out_depth=2,
+            stride_rows=1,
+            stride_cols=2,
+            padding=[[0, 0], [0, 0], [0, 5], [0, 0]],
+            test_input=False,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
   def testShapeFunctionEdgeCases(self):
     # All shapes unknown.
     c1 = nn_ops.conv2d(
@@ -1473,6 +2237,55 @@ class Conv2DTest(test.TestCase):
           strides=[1, 1, 1, 1],
           padding="SAME")
 
+    # Negative padding.
+    with self.assertRaises(ValueError):
+      nn_ops.conv2d(
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding=[[0, 0], [0, -1], [1, 2], [0, 0]])
+
+    # Nonzero padding in nonspatial dimension.
+    with self.assertRaises(ValueError):
+      nn_ops.conv2d(
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding=[[1, 0], [0, 0], [0, 0], [0, 0]])
+
+    # Nonzero NCHW padding in nonspatial dimension.
+    with self.assertRaises(ValueError):
+      nn_ops.conv2d(
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding=[[0, 0], [0, 1], [0, 0], [0, 0]],
+          data_format="NCHW")
+
+    # Wrong amount of padding
+    with self.assertRaises(ValueError):
+      nn_ops.conv2d(
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding=[[0, 0], [0, 0], [0, 0]])
+
+    # Only specify one padding amount per dimension
+    with self.assertRaises(ValueError):
+      nn_ops.conv2d(
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding=[[0], [0], [0], [0]])
+
+    # Explicit padding elements are not lists
+    with self.assertRaises(ValueError):
+      nn_ops.conv2d(
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding=[0, 0, 0, 0])
+
   def testOpEdgeCases(self):
     with self.cached_session() as sess:
       # Illegal strides.
@@ -1513,6 +2326,41 @@ class Conv2DTest(test.TestCase):
                 strides=[1, 1, 1, 1],
                 padding="VALID"))
 
+      # Filter larger than input + padding.
+      with self.assertRaisesRegexp(ValueError, "Negative dimension size"):
+        sess.run(
+            nn_ops.conv2d(
+                array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
+                array_ops.placeholder(dtypes.float32, shape=[24, 25, 3, 2]),
+                strides=[1, 1, 1, 1],
+                padding=[[0, 0], [2, 2], [2, 2], [0, 0]]))
+
+    if test.is_gpu_available(cuda_only=True):
+      with self.test_session(use_gpu=True):
+        # Negative padding during backprop.
+        with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                     "nonnegative"):
+          sess.run(
+              nn_ops.conv2d_backprop_input([32, 20, 20, 3],
+                                           array_ops.placeholder(
+                                               dtypes.float32,
+                                               shape=[18, 18, 3, 2]),
+                                           array_ops.placeholder(
+                                               dtypes.float32,
+                                               shape=[32, 3, 2, 2]),
+                                           strides=[1, 1, 1, 1],
+                                           padding=[[0, 0], [-1, 0], [0, 0],
+                                                    [0, 0]]))
+        with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                     "nonnegative"):
+          sess.run(
+              nn_ops.conv2d_backprop_filter(
+                  array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
+                  [18, 18, 3, 2],
+                  array_ops.placeholder(dtypes.float32, shape=[32, 3, 2, 2]),
+                  strides=[1, 1, 1, 1],
+                  padding=[[0, 0], [-1, 0], [0, 0], [0, 0]]))
+
 
 class DepthwiseConv2DTest(test.TestCase):
 
@@ -1546,7 +2394,7 @@ class DepthwiseConv2DTest(test.TestCase):
       conv = nn_impl.depthwise_conv2d(
           t1, t2, strides=[1, stride, stride, 1], padding=padding)
       value = self.evaluate(conv)
-    tf_logging.info("value = ", value)
+    tf_logging.debug("value = %s", value)
     self.assertArrayNear(expected, np.ravel(value), 1e-5)
     self.assertShapeEqual(value, conv)
 
@@ -1668,7 +2516,7 @@ class SeparableConv2DTest(test.TestCase):
         conv = array_ops.transpose(conv, [0, 2, 3, 1])
 
       value = self.evaluate(conv)
-    tf_logging.info("value = ", value)
+    tf_logging.debug("value = %s", value)
     self.assertArrayNear(expected, np.ravel(value), 1e-3)
     self.assertShapeEqual(value, conv)
 
@@ -1828,6 +2676,194 @@ class Conv2DBenchmark(test.Benchmark):
             name="conv_stack_iter_%d" % iter_index, wall_time=wall_time)
         tf_logging.info("conv_stack_iter_%d: %.4f" % (iter_index, wall_time))
 
+  def _bench_op(self, name, op, burn_iters, num_iters):
+    config = config_pb2.ConfigProto()
+    # Prevent Grappler from optimizing away the entire graph.
+    config.graph_options.rewrite_options.dependency_optimization = (
+        rewriter_config_pb2.RewriterConfig.OFF)
+    with session_lib.Session(config=config) as session:
+      variables.global_variables_initializer().run()
+      self.run_op_benchmark(
+          session, op, burn_iters=burn_iters, min_iters=num_iters, name=name)
+
+  def benchmarkExplicitVsManualPadding(self):
+    """Compare performance of EXPLICIT padding and calling tf.pad.
+
+    A Conv2D op with EXPLICIT padding is benchmarked, and a tf.pad with the same
+    padding followed by an equivalent Conv2D op is benchmarked.
+    """
+    if not test.is_gpu_available():
+      return
+
+    with ops.Graph().as_default():
+      burn_iters = 15
+      num_iters = 300
+      batch_size = 64
+      # The input and filter correspond to the first layer of Resnet50.
+      input = variables.Variable(  # pylint: disable=redefined-builtin
+          random_ops.random_uniform([
+              batch_size,
+              3,
+              224,
+              224
+          ]))
+      filter = variables.Variable(random_ops.random_uniform([7, 7, 3, 64]))  # pylint: disable=redefined-builtin
+      strides = [1, 1, 2, 2]
+      padding = [(0, 0), (0, 0), (3, 3), (3, 3)]
+      output_explicit_pad = nn_ops.conv2d(
+          input, filter, strides, padding=padding, data_format="NCHW")
+      input_padded = array_ops.pad(input, padding)
+      output_manual_pad = nn_ops.conv2d(
+          input_padded, filter, strides, padding="VALID", data_format="NCHW")
+      # Benchmark just the forward pass.
+      self._bench_op("explicit_pad_forward", output_explicit_pad.op, burn_iters,
+                     num_iters)
+      self._bench_op("manual_pad_forward", output_manual_pad.op, burn_iters,
+                     num_iters)
+
+      # Benchmark both the forward and backwards passes.
+      input_grad_explicit_pad, filter_grad_explicit_pad = (
+          gradients_impl.gradients(output_explicit_pad, [input, filter]))
+      self._bench_op(
+          "explicit_pad_backward",
+          control_flow_ops.group(input_grad_explicit_pad,
+                                 filter_grad_explicit_pad), burn_iters,
+          num_iters)
+      input_grad_manual_pad, filter_grad_manual_pad = gradients_impl.gradients(
+          output_manual_pad, [input, filter])
+      self._bench_op(
+          "manual_pad_backward",
+          control_flow_ops.group(input_grad_manual_pad, filter_grad_manual_pad),
+          burn_iters, num_iters)
+
+  def benchmarkExplicitVsSamePaddingGraph(self):
+    """Compare performance of EXPLICIT and SAME padding in graph mode.
+
+    A Conv2D op with SAME padding is benchmarked, and an equivalent Conv2D op
+    with explicit padding is benchmarked, where the padding is the same as in
+    the SAME case. The purpose is to ensure EXPLICIT padding is just as
+    efficient as the SAME case
+    """
+    if not test.is_gpu_available():
+      return
+
+    with ops.Graph().as_default():
+      burn_iters = 15
+      num_convs = 20
+      num_iters = 50
+      batch_size = 64
+      # The input and filter correspond to a middle layer of Resnet50.
+      input = variables.Variable(  # pylint: disable=redefined-builtin
+          random_ops.random_uniform([
+              batch_size,
+              256,
+              14,
+              14
+          ]))
+      filter = variables.Variable(random_ops.random_uniform([3, 3, 256, 256]))  # pylint: disable=redefined-builtin
+      strides = [1, 1, 1, 1]
+      padding = [(0, 0), (0, 0), (1, 1), (1, 1)]
+      output_explicit_pad = input
+      output_same_pad = input
+
+      for _ in range(num_convs):
+        output_explicit_pad = nn_ops.conv2d(
+            output_explicit_pad,
+            filter,
+            strides,
+            padding=padding,
+            data_format="NCHW")
+        output_same_pad = nn_ops.conv2d(
+            output_same_pad,
+            filter,
+            strides,
+            padding="SAME",
+            data_format="NCHW")
+      grad_explicit_pad, = gradients_impl.gradients(output_explicit_pad, filter)
+      grad_same_pad, = gradients_impl.gradients(output_same_pad, filter)
+      self._bench_op("graph_explicit_pad", grad_explicit_pad.op, burn_iters,
+                     num_iters)
+      self._bench_op("graph_same_pad", grad_same_pad.op, burn_iters, num_iters)
+
+  def benchmarkExplicitVsSamePaddingEager(self):
+    """Compare performance of EXPLICIT and SAME padding in eager mode.
+
+    A Conv2D op with SAME padding is benchmarked, and an equivalent Conv2D op
+    with explicit padding is benchmarked, where the padding is the same as in
+    the SAME case. Currently, EXPLICIT padding is slightly slower, due to the
+    fact the Python padding list must be checked and processed before the Conv2D
+    op can run.
+    """
+    # TODO(reedwm): Make EXPLICIT padding as fast as SAME padding.
+    if not test.is_gpu_available():
+      return
+
+    with context.eager_mode():
+      burn_iters = 15
+      num_convs = 20
+      num_iters = 50
+      batch_size = 64
+      # The input and filter correspond to a middle layer of Resnet50.
+      input = variables.Variable(  # pylint: disable=redefined-builtin
+          random_ops.random_uniform([
+              batch_size,
+              256,
+              14,
+              14
+          ]))
+      filter = variables.Variable(random_ops.random_uniform([3, 3, 256, 256]))  # pylint: disable=redefined-builtin
+      strides = [1, 1, 1, 1]
+      padding = [(0, 0), (0, 0), (1, 1), (1, 1)]
+      output_explicit_pad = input
+      output_same_pad = input
+      for _ in range(burn_iters):
+        output_explicit_pad = nn_ops.conv2d(
+            output_explicit_pad,
+            filter,
+            strides,
+            padding=padding,
+            data_format="NCHW")
+        output_same_pad = nn_ops.conv2d(
+            output_same_pad,
+            filter,
+            strides,
+            padding="SAME",
+            data_format="NCHW")
+
+      start = time.time()
+      for _ in range(num_iters):
+        with backprop.GradientTape() as tape:
+          for _ in range(num_convs):
+            output_explicit_pad = nn_ops.conv2d(
+                output_explicit_pad,
+                filter,
+                strides,
+                padding=padding,
+                data_format="NCHW")
+          tape.gradient(output_explicit_pad, filter)
+      end = time.time()
+      self.report_benchmark(
+          name="eager_explicit_pad",
+          wall_time=(end - start) / num_iters,
+          iters=num_iters)
+
+      start = time.time()
+      for _ in range(num_iters):
+        with backprop.GradientTape() as tape:
+          for _ in range(num_convs):
+            output_same_pad = nn_ops.conv2d(
+                output_same_pad,
+                filter,
+                strides,
+                padding="SAME",
+                data_format="NCHW")
+          tape.gradient(output_same_pad, filter)
+      end = time.time()
+      self.report_benchmark(
+          name="eager_same_pad",
+          wall_time=(end - start) / num_iters,
+          iters=num_iters)
+
 
 def GetInceptionFwdTest(input_size, filter_size, stride, padding,
                         gpu_only=False):
diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index 0d6a3cbd3527ac409ddf5c1c851c8993f404d029..8988305bde55804424c2627071e4e7ee2892d4db 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -200,6 +201,13 @@ class FunctionalOpsTest(test.TestCase):
     self.assertAllEqual(
         np.array([(x + 3) * 2 for x in nums]), self.evaluate(r))
 
+  def testMapDtypeEager(self):
+    with context.eager_mode():
+      dtype = functional_ops.map_fn(lambda x: constant_op.constant(""),
+                                    constant_op.constant([]),
+                                    dtype=dtypes.string).dtype
+      self.assertEqual(dtype, dtypes.string)
+
   def testMapSparseTensor(self):
     with self.cached_session():
       with self.assertRaises(TypeError):
@@ -762,6 +770,26 @@ class FunctionalOpsTest(test.TestCase):
           self.assertAllEqual(Run(sess, 20.), 210.)
           self.assertAllEqual(Run(sess, 100.), 5050.)
 
+  # Like above, but using int32 in order to ensure that int32 tensors don't get
+  # copied to the GPU during the application of the while.
+  def testWhileInt32(self):
+    with ops.Graph().as_default() as g:
+
+      @function.Defun(*[dtypes.int32] * 2)
+      def Cond(n, unused_x):
+        return n > 0
+
+      @function.Defun(*[dtypes.int32] * 2)
+      def Body(n, x):
+        return n - 1, x + n
+
+      def Run(sess, n):
+        return sess.run(functional_ops.While([n, 0], Cond, Body))[1]
+
+      with self.session(graph=g, use_gpu=True) as sess:
+        self.assertAllEqual(Run(sess, 20), 210)
+        self.assertAllEqual(Run(sess, 100), 5050)
+
   @test_util.run_deprecated_v1
   def testWhileLowering(self):
 
diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py
index 09b9944baa1d92bfbcd484f5dba45cea28e6eafe..4b9681afd2cac5660107ca8072770f66944ec2a4 100644
--- a/tensorflow/python/kernel_tests/init_ops_test.py
+++ b/tensorflow/python/kernel_tests/init_ops_test.py
@@ -592,6 +592,22 @@ class LinSpaceTest(test.TestCase):
       self.assertArrayNear(self._LinSpace(5., 5., 3), np.array([5.] * 3), 1e-5)
       self.assertArrayNear(self._LinSpace(5., 5., 4), np.array([5.] * 4), 1e-5)
 
+  def testEndpointsAreExact(self):
+    for self.force_gpu in self._gpu_modes():
+      # Test some cases that produce last values not equal to "stop" when
+      # computed via start + (num - 1) * ((stop - start) / (num - 1)), since
+      # float arithmetic will introduce error through precision loss.
+      self.assertAllEqual(
+          self._LinSpace(0., 1., 42)[[0, -1]], np.array([0., 1.], np.float32))
+      self.assertAllEqual(
+          self._LinSpace(-1., 0., 42)[[0, -1]], np.array([-1., 0.], np.float32))
+      self.assertAllEqual(
+          self._LinSpace(.1, .2, 4)[[0, -1]], np.array([.1, .2], np.float32))
+      # Check a case for float64 error too.
+      self.assertAllEqual(
+          self._LinSpace(np.array(0., np.float64), .1, 12)[[0, -1]],
+          np.array([0., .1], np.float64))
+
 
 class DeviceTest(test.TestCase):
 
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
index 8e296c026c09b36afd39b891befb767a222f5f19..ec78a3ffe0b2ae1ff5c5f6c4d73480f2ad92fd26 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
@@ -30,6 +30,8 @@ _CHOLESKY_DECOMPS = linear_operator_algebra._CHOLESKY_DECOMPS
 _MATMUL = linear_operator_algebra._MATMUL
 _registered_cholesky = linear_operator_algebra._registered_cholesky
 _registered_matmul = linear_operator_algebra._registered_matmul
+_INVERSES = linear_operator_algebra._INVERSES
+_registered_inverse = linear_operator_algebra._registered_inverse
 # pylint: enable=protected-access
 
 
@@ -129,5 +131,51 @@ class MatmulTest(test.TestCase):
       self.assertEqual(v, _registered_matmul(k[0], k[1]))
 
 
+class InverseTest(test.TestCase):
+
+  def testRegistration(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+
+      def _matmul(self, a):
+        pass
+
+      def _shape(self):
+        return tensor_shape.TensorShape([1, 1])
+
+      def _shape_tensor(self):
+        pass
+
+    # Register Inverse to a lambda that spits out the name parameter
+    @linear_operator_algebra.RegisterInverse(CustomLinOp)
+    def _inverse(a):  # pylint: disable=unused-argument,unused-variable
+      return "OK"
+
+    with self.assertRaisesRegexp(ValueError, "singular"):
+      CustomLinOp(dtype=None, is_non_singular=False).inverse()
+
+    self.assertEqual("OK", CustomLinOp(
+        dtype=None, is_non_singular=True).inverse())
+
+  def testRegistrationFailures(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+      pass
+
+    with self.assertRaisesRegexp(TypeError, "must be callable"):
+      linear_operator_algebra.RegisterInverse(CustomLinOp)("blah")
+
+    # First registration is OK
+    linear_operator_algebra.RegisterInverse(CustomLinOp)(lambda a: None)
+
+    # Second registration fails
+    with self.assertRaisesRegexp(ValueError, "has already been registered"):
+      linear_operator_algebra.RegisterInverse(CustomLinOp)(lambda a: None)
+
+  def testExactRegistrationsAllMatch(self):
+    for (k, v) in _INVERSES.items():
+      self.assertEqual(v, _registered_inverse(k[0]))
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
index f0cc5d709f9bfec2e3dcfadecc8f949bb6ce6e6d..96e6e3c04c77e2a32d11d72feea02c177cfa3e61 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
@@ -155,20 +155,38 @@ class SquareLinearOperatorBlockDiagTest(
         is_self_adjoint=True,
     )
     cholesky_factor = operator.cholesky()
-    self.assertTrue(isinstance(
+    self.assertIsInstance(
         cholesky_factor,
-        block_diag.LinearOperatorBlockDiag))
+        block_diag.LinearOperatorBlockDiag)
     self.assertEqual(2, len(cholesky_factor.operators))
-    self.assertTrue(
-        isinstance(
-            cholesky_factor.operators[0],
-            lower_triangular.LinearOperatorLowerTriangular)
+    self.assertIsInstance(
+        cholesky_factor.operators[0],
+        lower_triangular.LinearOperatorLowerTriangular)
+    self.assertIsInstance(
+        cholesky_factor.operators[1],
+        lower_triangular.LinearOperatorLowerTriangular
     )
-    self.assertTrue(
-        isinstance(
-            cholesky_factor.operators[1],
-            lower_triangular.LinearOperatorLowerTriangular)
+
+  def test_block_diag_inverse_type(self):
+    matrix = [[1., 0.], [0., 1.]]
+    operator = block_diag.LinearOperatorBlockDiag(
+        [
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_non_singular=True,
+            ),
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_non_singular=True,
+            ),
+        ],
+        is_non_singular=True,
     )
+    inverse = operator.inverse()
+    self.assertIsInstance(
+        inverse,
+        block_diag.LinearOperatorBlockDiag)
+    self.assertEqual(2, len(inverse.operators))
 
   def test_is_non_singular_auto_set(self):
     # Matrix with two positive eigenvalues, 11 and 8.
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
index dcbc0dd7c97184df150fc7094a28441fcfaa1257..4d7a31be87cf5f51d952704ee585d140c3147a3f 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
@@ -194,9 +194,12 @@ class LinearOperatorDiagTest(
         is_positive_definite=True,
         is_self_adjoint=True,
     )
-    self.assertTrue(isinstance(
-        operator.cholesky(),
-        linalg.LinearOperatorDiag))
+    self.assertIsInstance(operator.cholesky(), linalg.LinearOperatorDiag)
+
+  def test_diag_inverse_type(self):
+    diag = [1., 3., 5., 8.]
+    operator = linalg.LinearOperatorDiag(diag, is_non_singular=True)
+    self.assertIsInstance(operator.inverse(), linalg.LinearOperatorDiag)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
index 2da5e712d77b88ca6bb20a5f0920335f00c7b594..14b5228bca1eba85118a3982ea2fe14da175eb54 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
@@ -265,9 +265,14 @@ class LinearOperatorIdentityTest(
         is_positive_definite=True,
         is_self_adjoint=True,
     )
-    self.assertTrue(isinstance(
-        operator.cholesky(),
-        linalg_lib.LinearOperatorIdentity))
+    self.assertIsInstance(
+        operator.cholesky(), linalg_lib.LinearOperatorIdentity)
+
+  def test_identity_inverse_type(self):
+    operator = linalg_lib.LinearOperatorIdentity(
+        num_rows=2, is_non_singular=True)
+    self.assertIsInstance(
+        operator.inverse(), linalg_lib.LinearOperatorIdentity)
 
 
 class LinearOperatorScaledIdentityTest(
@@ -491,10 +496,19 @@ class LinearOperatorScaledIdentityTest(
         is_positive_definite=True,
         is_self_adjoint=True,
     )
-    self.assertTrue(isinstance(
+    self.assertIsInstance(
         operator.cholesky(),
-        linalg_lib.LinearOperatorScaledIdentity))
+        linalg_lib.LinearOperatorScaledIdentity)
 
+  def test_scaled_identity_inverse_type(self):
+    operator = linalg_lib.LinearOperatorScaledIdentity(
+        num_rows=2,
+        multiplier=3.,
+        is_non_singular=True,
+    )
+    self.assertIsInstance(
+        operator.inverse(),
+        linalg_lib.LinearOperatorScaledIdentity)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
index 513b246803233f1117b48f1a3d413be42f15238a..54ccc0c5f642ad98c04174d01d9fca0c0fc056d6 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
@@ -100,7 +100,7 @@ class SquareLinearOperatorKroneckerTest(
 
   @property
   def _tests_to_skip(self):
-    return ["det", "solve", "solve_with_broadcast"]
+    return ["det", "inverse", "solve", "solve_with_broadcast"]
 
   def _operator_and_matrix(
       self, build_info, dtype, use_placeholder,
@@ -211,20 +211,33 @@ class SquareLinearOperatorKroneckerTest(
         is_self_adjoint=True,
     )
     cholesky_factor = operator.cholesky()
-    self.assertTrue(isinstance(
+    self.assertIsInstance(
         cholesky_factor,
-        kronecker.LinearOperatorKronecker))
+        kronecker.LinearOperatorKronecker)
     self.assertEqual(2, len(cholesky_factor.operators))
-    self.assertTrue(
-        isinstance(
-            cholesky_factor.operators[0],
-            lower_triangular.LinearOperatorLowerTriangular)
-    )
-    self.assertTrue(
-        isinstance(
-            cholesky_factor.operators[1],
-            lower_triangular.LinearOperatorLowerTriangular)
+    self.assertIsInstance(
+        cholesky_factor.operators[0],
+        lower_triangular.LinearOperatorLowerTriangular)
+    self.assertIsInstance(
+        cholesky_factor.operators[1],
+        lower_triangular.LinearOperatorLowerTriangular)
+
+  def test_kronecker_inverse_type(self):
+    matrix = [[1., 0.], [0., 1.]]
+    operator = kronecker.LinearOperatorKronecker(
+        [
+            linalg.LinearOperatorFullMatrix(
+                matrix, is_non_singular=True),
+            linalg.LinearOperatorFullMatrix(
+                matrix, is_non_singular=True),
+        ],
+        is_non_singular=True,
     )
+    inverse = operator.inverse()
+    self.assertIsInstance(
+        inverse,
+        kronecker.LinearOperatorKronecker)
+    self.assertEqual(2, len(inverse.operators))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
index eb0b8ef127749e9e5709861d14b143877790bffd..10651d3c8afa0e29766d20c3dc8177af94678336 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
@@ -36,7 +36,8 @@ class LinearOperatorZerosTest(
 
   @property
   def _tests_to_skip(self):
-    return ["cholesky", "log_abs_det", "solve", "solve_with_broadcast"]
+    return [
+        "cholesky", "log_abs_det", "inverse", "solve", "solve_with_broadcast"]
 
   @property
   def _operator_build_infos(self):
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index 489f6c9b00471e6c10a8a04830613e9c5b99661a..ec6906f20c706277d3a019e0ea9e7caa3f5168e3 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -189,6 +189,47 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
       self.evaluate(t)
 
+  def _testStackWithUninitializedTensors(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[], num_elements=3)
+    t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t), [0., 0., 0.])
+
+  def testStackWithUninitializedTensors(self):
+    self._testStackWithUninitializedTensors()
+
+  def testStackWithUninitializedTensorsGpu(self):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self._testStackWithUninitializedTensors()
+
+  def _testStackWithUninitializedTensorsInferShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    l = list_ops.tensor_list_set_item(l, 1, [1., 2.])
+    t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t), [[0., 0.], [1., 2.], [0., 0.]])
+
+  def testStackWithUninitializedTensorsInferShape(self):
+    self._testStackWithUninitializedTensorsInferShape()
+
+  def testStackWithUninitializedTensorsInferShapeGpu(self):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self._testStackWithUninitializedTensorsInferShape()
+
+  def testStackReservedListWithNoElementsAndPartialElementShapeFails(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "Tried to stack list which only contains "
+                                 "uninitialized tensors and has a "
+                                 "non-fully-defined element_shape: <unknown>"):
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
   @parameterized.named_parameters(("NoMaxNumElements", None),
                                   ("WithMaxNumElements", 2))
   def testGatherGrad(self, max_num_elements):
@@ -290,6 +331,47 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       t = list_ops.tensor_list_gather(l, [], element_dtype=dtypes.float32)
       self.evaluate(t)
 
+  def testGatherGradWithNonContiguousIndices(self):
+    with backprop.GradientTape(persistent=True) as tape:
+      t = constant_op.constant([1.0, 2.0, 3.0])
+      l = list_ops.tensor_list_from_tensor(t, element_shape=[])
+      c = constant_op.constant(5.0)
+      tape.watch(c)
+      l = list_ops.tensor_list_set_item(l, 1, c)
+      t = list_ops.tensor_list_gather(l, [1], element_dtype=dtypes.float32)
+      self.assertAllEqual(self.evaluate(t), [5.0])
+      s = t[0] * t[0]
+    dt = tape.gradient(s, c)
+    self.assertAllEqual(self.evaluate(dt), 10.0)
+    dl = tape.gradient(t, l)
+    dl_length = list_ops.tensor_list_length(dl)
+    self.assertAllEqual(self.evaluate(dl_length), 3)
+
+  def testScatterOutputListSize(self):
+    c0 = constant_op.constant([1.0, 2.0])
+    l = list_ops.tensor_list_scatter(
+        c0, [1, 3], ops.convert_to_tensor([], dtype=dtypes.int32))
+    # TensorListScatter should return a list with size largest index + 1.
+    self.assertEqual(self.evaluate(list_ops.tensor_list_length(l)), 4)
+
+  def testScatterWithInvalidRowsInInputTensorFails(self):
+    c0 = constant_op.constant([1.0, 2.0])
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Invalid number of rows in input tensor. Expected: 3 Actual: 2"):
+      l = list_ops.tensor_list_scatter(
+          c0, [1, 0, 2], ops.convert_to_tensor([], dtype=dtypes.int32))
+      self.evaluate(l)
+
+  def testScatterWithNegativeIndicesFails(self):
+    c0 = constant_op.constant([1.0, 2.0])
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Indices in TensorListScatter must all be positive."):
+      l = list_ops.tensor_list_scatter(
+          c0, [-1, -2], ops.convert_to_tensor([], dtype=dtypes.int32))
+      self.evaluate(l)
+
   def testScatterGrad(self):
     with backprop.GradientTape() as tape:
       c0 = constant_op.constant([1.0, 2.0])
@@ -345,6 +427,20 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       self.assertAllEqual(self.evaluate(e), 10.0)
     self.assertAllEqual(self.evaluate(tape.gradient(e, t)), 2.0)
 
+  @test_util.run_deprecated_v1
+  @test_util.enable_control_flow_v2
+  def testSkipEagerSetItemIndexOutOfBounds(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[])
+    e0 = constant_op.constant(5.)
+    l = list_ops.tensor_list_set_item(
+        l, 0, 2. * e0, resize_if_index_out_of_bounds=True)
+    l = list_ops.tensor_list_set_item(
+        l, 1, 1., resize_if_index_out_of_bounds=True)
+    t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    grad = gradients_impl.gradients(t, e0)[0]
+    self.assertAllEqual(self.evaluate(grad), 2.)
+
   @test_util.run_deprecated_v1
   def testSetOnEmptyListWithMaxNumElementsFails(self):
     l = list_ops.empty_tensor_list(
@@ -1096,6 +1192,47 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
                                    element_shape=[1],
                                    lengths=[1, 1])
 
+  def testResizeGrow(self):
+    l = list_ops.tensor_list_from_tensor([1., 2.], element_shape=[])
+    l = list_ops.tensor_list_resize(l, 4)
+    self.assertEqual(self.evaluate(list_ops.tensor_list_length(l)), 4)
+    self.assertEqual(
+        self.evaluate(
+            list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)),
+        1.)
+    self.assertEqual(
+        self.evaluate(
+            list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)),
+        2.)
+
+  def testResizeShrink(self):
+    l = list_ops.tensor_list_from_tensor([1., 2., 3.], element_shape=[])
+    l = list_ops.tensor_list_resize(l, 2)
+    self.assertEqual(self.evaluate(list_ops.tensor_list_length(l)), 2)
+    self.assertAllEqual(
+        self.evaluate(
+            list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)),
+        [1., 2.])
+
+  def testResizeWithInvalidSizeFails(self):
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "TensorListSlice expects size to be non-negative"):
+      l = list_ops.tensor_list_from_tensor([1., 2., 3.], element_shape=[])
+      l = list_ops.tensor_list_resize(l, -1)
+      self.evaluate(l)
+
+  @test_util.run_deprecated_v1
+  @test_util.enable_control_flow_v2
+  def testSkipEagerResizeGrad(self):
+    t = constant_op.constant([1., 2., 3.])
+    l = list_ops.tensor_list_from_tensor(t, element_shape=[])
+    l = list_ops.tensor_list_set_item(
+        l, 3, 4., resize_if_index_out_of_bounds=True)
+    t1 = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    grad = gradients_impl.gradients(t1, t)[0]
+    self.assertAllEqual(self.evaluate(grad), [1., 1., 1.])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/logging_ops_test.py b/tensorflow/python/kernel_tests/logging_ops_test.py
index 85035e5f7d308c323786bc9fd9017fda89dbec13..3896b138c9462250475c77ccec300a122e3b0a8c 100644
--- a/tensorflow/python/kernel_tests/logging_ops_test.py
+++ b/tensorflow/python/kernel_tests/logging_ops_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import string
 import sys
 import tempfile
 
@@ -37,6 +38,7 @@ from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
+
 class LoggingOpsTest(test.TestCase):
 
   @test_util.run_deprecated_v1
@@ -80,6 +82,17 @@ class PrintV2Test(test.TestCase):
       expected = "[0 1 2 ... 7 8 9]"
       self.assertTrue((expected + "\n") in printed.contents())
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testPrintOneStringTensor(self):
+    with self.cached_session():
+      tensor = ops.convert_to_tensor([char for char in string.ascii_lowercase])
+      with self.captureWritesToStream(sys.stderr) as printed:
+        print_op = logging_ops.print_v2(tensor)
+        self.evaluate(print_op)
+
+      expected = "[\"a\" \"b\" \"c\" ... \"x\" \"y\" \"z\"]"
+      self.assertIn((expected + "\n"), printed.contents())
+
   @test_util.run_in_graph_and_eager_modes()
   def testPrintOneTensorVarySummarize(self):
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/lookup_ops_test.py b/tensorflow/python/kernel_tests/lookup_ops_test.py
index ad81e0be649f17fe97691b1c5739dbe0bf4a63d2..3dd1ee33d91764e42e074fea87a40ad1e786b260 100644
--- a/tensorflow/python/kernel_tests/lookup_ops_test.py
+++ b/tensorflow/python/kernel_tests/lookup_ops_test.py
@@ -574,6 +574,7 @@ class KeyValueTensorInitializerTest(test.TestCase):
     with ops.Graph().as_default(), self.cached_session():
       init = lookup_ops.KeyValueTensorInitializer(
           ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string, dtypes.int64)
+      self.assertEqual("", init._shared_name)
       table = lookup_ops.HashTable(init, default_value=-1)
       table.initializer.run()
 
@@ -583,6 +584,7 @@ class KeyValueTensorInitializerTest(test.TestCase):
         init1 = lookup_ops.KeyValueTensorInitializer(
             ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string,
             dtypes.int64)
+        self.assertEqual("", init1._shared_name)
         table1 = lookup_ops.HashTable(init1, default_value=-1)
         self.assertEquals("hash_table", table1.name)
         self.assertEquals("table_scope/hash_table",
@@ -590,6 +592,7 @@ class KeyValueTensorInitializerTest(test.TestCase):
         init2 = lookup_ops.KeyValueTensorInitializer(
             ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string,
             dtypes.int64)
+        self.assertEqual("", init2._shared_name)
         table2 = lookup_ops.HashTable(init2, default_value=-1)
         self.assertEquals("hash_table_1", table2.name)
         self.assertEquals("table_scope/hash_table_1",
@@ -599,6 +602,7 @@ class KeyValueTensorInitializerTest(test.TestCase):
     with ops.Graph().as_default(), self.cached_session():
       init = lookup_ops.KeyValueTensorInitializer((42, 1, -1000), (0, 1, 2),
                                                   dtypes.int64, dtypes.int64)
+      self.assertEqual("", init._shared_name)
       table = lookup_ops.HashTable(init, default_value=-1)
       table.initializer.run()
 
@@ -607,6 +611,7 @@ class KeyValueTensorInitializerTest(test.TestCase):
     with ops.Graph().as_default(), self.cached_session():
       init = lookup_ops.KeyValueTensorInitializer((42, 1, -1000), (0, 1, 2),
                                                   dtypes.int32, dtypes.int64)
+      self.assertEqual("", init._shared_name)
       table = lookup_ops.HashTable(init, default_value=-1)
       with self.assertRaisesRegexp(
           errors_impl.OpError, "No OpKernel was registered"):
@@ -885,10 +890,11 @@ class InitializeTableFromFileOpTest(test.TestCase):
   def testInitializeStringTable(self):
     vocabulary_file = self._createVocabFile("one_column_1.txt")
     default_value = -1
-    table = lookup_ops.HashTable(
-        lookup_ops.TextFileInitializer(
-            vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
-            dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER), default_value)
+    init = lookup_ops.TextFileInitializer(
+        vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+        dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+    self.assertTrue("one_column_1.txt_-2_-1" in init._shared_name)
+    table = lookup_ops.HashTable(init, default_value)
     self.evaluate(table.initializer)
 
     output = table.lookup(constant_op.constant(["brain", "salad", "tank"]))
@@ -903,11 +909,11 @@ class InitializeTableFromFileOpTest(test.TestCase):
 
     with self.cached_session():
       default_value = -1
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(
-              vocabulary_file, dtypes.int64,
-              lookup_ops.TextFileIndex.WHOLE_LINE, dtypes.int64,
-              lookup_ops.TextFileIndex.LINE_NUMBER), default_value)
+      init = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.int64, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+      self.assertTrue("one_column_int64.txt_-2_-1" in init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
       table.initializer.run()
 
       output = table.lookup(
@@ -924,10 +930,11 @@ class InitializeTableFromFileOpTest(test.TestCase):
       default_value = "UNK"
       key_index = lookup_ops.TextFileIndex.LINE_NUMBER
       value_index = lookup_ops.TextFileIndex.WHOLE_LINE
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.int64,
-                                         key_index, dtypes.string, value_index),
-          default_value)
+      init = lookup_ops.TextFileInitializer(vocabulary_file, dtypes.int64,
+                                            key_index, dtypes.string,
+                                            value_index)
+      self.assertTrue("one_column_2.txt_-1_-2" in init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
       table.initializer.run()
 
       input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
@@ -947,10 +954,11 @@ class InitializeTableFromFileOpTest(test.TestCase):
       key_index = 1
       value_index = 2
 
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                         key_index, dtypes.int64, value_index),
-          default_value)
+      init = lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
+                                            key_index, dtypes.int64,
+                                            value_index)
+      self.assertTrue("three_columns.txt_1_2" in init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
       table.initializer.run()
 
       input_string = constant_op.constant(["brain", "salad", "surgery"])
@@ -969,10 +977,11 @@ class InitializeTableFromFileOpTest(test.TestCase):
       default_value = -1
       key_index = 2
       value_index = 1
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                         key_index, dtypes.int64, value_index),
-          default_value)
+      init = lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
+                                            key_index, dtypes.int64,
+                                            value_index)
+      self.assertTrue("three_columns.txt_2_1" in init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
       with self.assertRaisesOpError("is not a valid"):
         table.initializer.run()
 
@@ -985,10 +994,11 @@ class InitializeTableFromFileOpTest(test.TestCase):
       value_index = lookup_ops.TextFileIndex.LINE_NUMBER
 
       with self.assertRaises(ValueError):
-        lookup_ops.HashTable(
-            lookup_ops.TextFileInitializer(vocabulary_file, dtypes.int64,
-                                           key_index, dtypes.string,
-                                           value_index), default_value)
+        init = lookup_ops.TextFileInitializer(vocabulary_file, dtypes.int64,
+                                              key_index, dtypes.string,
+                                              value_index)
+        self.assertTrue("one_column_3.txt_-2_-1" in init._shared_name)
+        lookup_ops.HashTable(init, default_value)
 
   @test_util.run_deprecated_v1
   def testInvalidIndex(self):
@@ -997,10 +1007,11 @@ class InitializeTableFromFileOpTest(test.TestCase):
       default_value = -1
       key_index = 1  # second column of the line
       value_index = lookup_ops.TextFileIndex.LINE_NUMBER
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                         key_index, dtypes.int64, value_index),
-          default_value)
+      init = lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
+                                            key_index, dtypes.int64,
+                                            value_index)
+      self.assertTrue("one_column_4.txt_1_-1" in init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
 
       with self.assertRaisesOpError("Invalid number of columns"):
         table.initializer.run()
@@ -1009,30 +1020,27 @@ class InitializeTableFromFileOpTest(test.TestCase):
   def testInitializeSameTableWithMultipleNodes(self):
     vocabulary_file = self._createVocabFile("one_column_5.txt")
 
-    with self.cached_session() as sess:
+    with self.cached_session():
       shared_name = "shared-one-columm"
       default_value = -1
-      table1 = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                         lookup_ops.TextFileIndex.WHOLE_LINE,
-                                         dtypes.int64,
-                                         lookup_ops.TextFileIndex.LINE_NUMBER),
-          default_value,
-          shared_name=shared_name)
-      table2 = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                         lookup_ops.TextFileIndex.WHOLE_LINE,
-                                         dtypes.int64,
-                                         lookup_ops.TextFileIndex.LINE_NUMBER),
-          default_value,
-          shared_name=shared_name)
-      table3 = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                         lookup_ops.TextFileIndex.WHOLE_LINE,
-                                         dtypes.int64,
-                                         lookup_ops.TextFileIndex.LINE_NUMBER),
-          default_value,
-          shared_name=shared_name)
+      init1 = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+      self.assertTrue("one_column_5.txt_-2_-1" in init1._shared_name)
+      table1 = lookup_ops.HashTable(init1, default_value,
+                                    shared_name=shared_name)
+      init2 = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+      self.assertTrue("one_column_5.txt_-2_-1" in init2._shared_name)
+      table2 = lookup_ops.HashTable(init2, default_value,
+                                    shared_name=shared_name)
+      init3 = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+      self.assertTrue("one_column_5.txt_-2_-1" in init3._shared_name)
+      table3 = lookup_ops.HashTable(init3, default_value,
+                                    shared_name=shared_name)
 
       lookup_ops.tables_initializer().run()
 
@@ -1063,14 +1071,12 @@ class InitializeTableFromFileOpTest(test.TestCase):
       default_value = -1
       vocab_size = 3
       vocabulary_file1 = self._createVocabFile("one_column6.txt")
-      table1 = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(
-              vocabulary_file1,
-              dtypes.string,
-              lookup_ops.TextFileIndex.WHOLE_LINE,
-              dtypes.int64,
-              lookup_ops.TextFileIndex.LINE_NUMBER,
-              vocab_size=vocab_size), default_value)
+      init1 = lookup_ops.TextFileInitializer(
+          vocabulary_file1, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER,
+          vocab_size=vocab_size)
+      self.assertTrue("one_column6.txt_3_-2_-1" in init1._shared_name)
+      table1 = lookup_ops.HashTable(init1, default_value)
 
       # Initialize from file.
       table1.initializer.run()
@@ -1078,27 +1084,23 @@ class InitializeTableFromFileOpTest(test.TestCase):
 
       vocabulary_file2 = self._createVocabFile("one_column7.txt")
       vocab_size = 5
-      table2 = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(
-              vocabulary_file2,
-              dtypes.string,
-              lookup_ops.TextFileIndex.WHOLE_LINE,
-              dtypes.int64,
-              lookup_ops.TextFileIndex.LINE_NUMBER,
-              vocab_size=vocab_size), default_value)
+      init2 = lookup_ops.TextFileInitializer(
+          vocabulary_file2, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER,
+          vocab_size=vocab_size)
+      self.assertTrue("one_column7.txt_5_-2_-1" in init2._shared_name)
+      table2 = lookup_ops.HashTable(init2, default_value)
       with self.assertRaisesOpError("Invalid vocab_size"):
         table2.initializer.run()
 
       vocab_size = 1
       vocabulary_file3 = self._createVocabFile("one_column3.txt")
-      table3 = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(
-              vocabulary_file3,
-              dtypes.string,
-              lookup_ops.TextFileIndex.WHOLE_LINE,
-              dtypes.int64,
-              lookup_ops.TextFileIndex.LINE_NUMBER,
-              vocab_size=vocab_size), default_value)
+      init3 = lookup_ops.TextFileInitializer(
+          vocabulary_file3, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER,
+          vocab_size=vocab_size)
+      self.assertTrue("one_column3.txt_1_-2_-1" in init3._shared_name)
+      table3 = lookup_ops.HashTable(init3, default_value)
 
       # Smaller vocab size reads only vocab_size records.
       table3.initializer.run()
@@ -1110,11 +1112,11 @@ class InitializeTableFromFileOpTest(test.TestCase):
 
     with self.cached_session():
       default_value = -1
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(
-              "old_file.txt", dtypes.string,
-              lookup_ops.TextFileIndex.WHOLE_LINE, dtypes.int64,
-              lookup_ops.TextFileIndex.LINE_NUMBER), default_value)
+      init = lookup_ops.TextFileInitializer(
+          "old_file.txt", dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+      self.assertTrue("old_file.txt_-2_-1" in init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
 
       # Initialize with non existing file (old_file.txt) should fail.
       # TODO(yleon): Update message, which might change per FileSystem.
@@ -1162,9 +1164,10 @@ class InitializeTableFromFileOpTest(test.TestCase):
     with self.cached_session():
       default_value = "UNK"
       vocab_size = 3
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileStringTableInitializer(
-              vocab_file, vocab_size=vocab_size), default_value)
+      init = lookup_ops.TextFileStringTableInitializer(
+          vocab_file, vocab_size=vocab_size)
+      self.assertTrue("feat_to_id_1.txt_3_-1_-2", init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
 
       table.initializer.run()
 
@@ -1181,9 +1184,10 @@ class InitializeTableFromFileOpTest(test.TestCase):
     with self.cached_session():
       default_value = -1
       vocab_size = 3
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileIdTableInitializer(
-              vocab_file, vocab_size=vocab_size), default_value)
+      init = lookup_ops.TextFileIdTableInitializer(
+          vocab_file, vocab_size=vocab_size)
+      self.assertTrue("feat_to_id_2.txt_3_-1_-2", init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
       table.initializer.run()
 
       input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
@@ -1199,10 +1203,10 @@ class InitializeTableFromFileOpTest(test.TestCase):
     with self.cached_session():
       default_value = -1
       vocab_size = 3
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileIdTableInitializer(
-              vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64),
-          default_value)
+      init = lookup_ops.TextFileIdTableInitializer(
+          vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64)
+      self.assertTrue("feat_to_id_3.txt_3_-1_-2", init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
       table.initializer.run()
 
       out = table.lookup(
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index 7cbcf5ed1990c7da4cd420be0d08083a200f2600..259405511951559219c5062ef9363a7d6df61690 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -286,12 +286,19 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       tmp_dir = self.get_temp_dir()
       fname = os.path.join(tmp_dir, "var.pickle")
       with open(fname, "wb") as f:
-        v = resource_variable_ops.ResourceVariable(10.0)
+        v = resource_variable_ops.ResourceVariable(
+            10.0,
+            dtype=dtypes.float16,
+            name="v")
         pickle.dump(v, f)
 
       with open(fname, "rb") as f:
-        v = pickle.load(f)
-        self.assertAllEqual(v.numpy(), 10.0)
+        new_v = pickle.load(f)
+        self.assertEqual(new_v.name, v.name)
+        self.assertEqual(new_v.shape, v.shape)
+        self.assertEqual(new_v.dtype, v.dtype)
+        self.assertEqual(new_v.trainable, v.trainable)
+        self.assertAllEqual(new_v.numpy(), v.numpy())
 
   @test_util.run_in_graph_and_eager_modes
   def testScatterDiv(self):
@@ -629,7 +636,6 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
           variable_def=other_v_def)
       self.assertTrue(other_v_prime._cached_value is not None)
 
-  @test_util.run_v1_only("b/120545219")
   def testVariableDefInitializedInstances(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v_def = resource_variable_ops.ResourceVariable(
@@ -977,13 +983,23 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
   def testScatterNdAddStateOps(self):
     with context.eager_mode():
       v = resource_variable_ops.ResourceVariable(
-          [1, 1, 1, 1, 1, 1, 1, 1], dtype=dtypes.float32, name="add")
+          [1, 2, 3, 4, 5, 6, 7, 8], dtype=dtypes.float32, name="add")
       indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32)
       updates = constant_op.constant([9, 10, 11, 12], dtype=dtypes.float32)
-      expected = np.array([1, 12, 1, 11, 10, 1, 1, 13])
+      expected = np.array([1, 13, 3, 14, 14, 6, 7, 20])
       state_ops.scatter_nd_add(v, indices, updates)
       self.assertAllClose(expected, v.numpy())
 
+  def testScatterNdSubStateOps(self):
+    with context.eager_mode():
+      v = resource_variable_ops.ResourceVariable(
+          [1, 2, 3, 4, 5, 6, 7, 8], dtype=dtypes.float32, name="sub")
+      indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32)
+      updates = constant_op.constant([9, 10, 11, 12], dtype=dtypes.float32)
+      expected = np.array([1, -9, 3, -6, -4, 6, 7, -4])
+      state_ops.scatter_nd_sub(v, indices, updates)
+      self.assertAllClose(expected, v.numpy())
+
   def testScatterUpdateCast(self):
     with context.eager_mode():
       v = resource_variable_ops.ResourceVariable([1.0, 2.0], name="update")
diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index 8510a08f0c96dd9ae08a2ca3e782cc7d28e86264..e1cd9e762236486fa5e9e1866dce2df46df0eaa9 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -217,7 +217,7 @@ class StatefulScatterNdTest(test.TestCase):
   def testVariableRankAdd(self):
     self._VariableRankTests(_NumpyAdd, state_ops.scatter_nd_add)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_deprecated_v1
   def testVariableRankSub(self):
     self._VariableRankTests(_NumpySub, state_ops.scatter_nd_sub)
 
diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py
index 7598991489ce6019352e19cb6c50819d91085b0d..ede12d1c83fb559f2164c0e7f46640315d0ced62 100644
--- a/tensorflow/python/kernel_tests/sparse_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_ops_test.py
@@ -72,11 +72,10 @@ class SparseToIndicatorTest(test_util.TensorFlowTestCase):
         constant_op.constant(val, dtype),
         constant_op.constant(shape, dtypes.int64))
 
-  @test_util.run_deprecated_v1
   def testInt32(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_5x6(dtypes.int32)
-      output = sparse_ops.sparse_to_indicator(sp_input, 50).eval()
+      output = sparse_ops.sparse_to_indicator(sp_input, 50)
 
       expected_output = np.zeros((5, 50), dtype=np.bool)
       expected_trues = ((0, 0), (1, 10), (1, 13), (1, 14), (3, 32), (3, 33))
@@ -85,11 +84,10 @@ class SparseToIndicatorTest(test_util.TensorFlowTestCase):
 
       self.assertAllEqual(output, expected_output)
 
-  @test_util.run_deprecated_v1
   def testInt64(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_5x6(dtypes.int64)
-      output = sparse_ops.sparse_to_indicator(sp_input, 50).eval()
+      output = sparse_ops.sparse_to_indicator(sp_input, 50)
 
       expected_output = np.zeros((5, 50), dtype=np.bool)
       expected_trues = [(0, 0), (1, 10), (1, 13), (1, 14), (3, 32), (3, 33)]
@@ -98,11 +96,10 @@ class SparseToIndicatorTest(test_util.TensorFlowTestCase):
 
       self.assertAllEqual(output, expected_output)
 
-  @test_util.run_deprecated_v1
   def testHigherRank(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_2x3x4(dtypes.int64)
-      output = sparse_ops.sparse_to_indicator(sp_input, 200).eval()
+      output = sparse_ops.sparse_to_indicator(sp_input, 200)
 
       expected_output = np.zeros((2, 3, 200), dtype=np.bool)
       expected_trues = [(0, 0, 1), (0, 1, 10), (0, 1, 12), (1, 0, 103),
@@ -151,7 +148,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
   def testInt32AndFloat32(self):
     vocab_size = 50
     indices_v, values_v = self._SparseTensorValue_3x50(np.int32, np.float32)
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       for indices in (indices_v,
                       sparse_tensor.SparseTensor.from_value(indices_v)):
         for values in (values_v,
@@ -163,7 +160,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
 
   def testInt64AndFloat32(self):
     vocab_size = 50
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float32)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
@@ -172,7 +169,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
 
   def testInt64AndFloat64(self):
     vocab_size = 50
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
@@ -181,7 +178,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
 
   def testInt32AndFloat32NonCanonicalOrder(self):
     vocab_size = 50
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int32, np.float32)
       sp_output = sparse_ops.sparse_merge(
           indices, values, vocab_size, already_sorted=True)
@@ -191,7 +188,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
 
   def testInt64AndFloat32NonCanonicalOrder(self):
     vocab_size = 50
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float32)
       sp_output = sparse_ops.sparse_merge(
           indices, values, vocab_size, already_sorted=True)
@@ -202,7 +199,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
   def testInt64AndFloat64NonCanonicalOrder(self):
     vocab_size = 50
     vocab_size_tensor = constant_op.constant(vocab_size, dtypes.int64)
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(
           indices, values, vocab_size_tensor, already_sorted=True)
@@ -261,7 +258,7 @@ class SparseMergeHighDimTest(test_util.TensorFlowTestCase):
 
   def testInt64AndFloat32(self):
     vocab_size = [50, 31]
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float32)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
@@ -270,7 +267,7 @@ class SparseMergeHighDimTest(test_util.TensorFlowTestCase):
 
   def testInt64AndFloat64(self):
     vocab_size = [50, 31]
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
@@ -279,7 +276,7 @@ class SparseMergeHighDimTest(test_util.TensorFlowTestCase):
 
   def testInt64AndFloat64Shape(self):
     vocab_size = [50, 30]
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
@@ -300,9 +297,8 @@ class SparseRetainTest(test_util.TensorFlowTestCase):
   def _SparseTensor_5x6(self):
     return sparse_tensor.SparseTensor.from_value(self._SparseTensorValue_5x6())
 
-  @test_util.run_deprecated_v1
   def testBasic(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       for sp_input in (self._SparseTensorValue_5x6(), self._SparseTensor_5x6()):
         to_retain = np.array([1, 0, 0, 1, 1, 0], dtype=np.bool)
         sp_output = sparse_ops.sparse_retain(sp_input, to_retain)
@@ -314,7 +310,7 @@ class SparseRetainTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(output.dense_shape, [5, 6])
 
   def testRetainNone(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_5x6()
       to_retain = np.zeros((6,), dtype=np.bool)
       sp_output = sparse_ops.sparse_retain(sp_input, to_retain)
@@ -326,7 +322,7 @@ class SparseRetainTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(output.dense_shape, [5, 6])
 
   def testMismatchedRetainShape(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_5x6()
       to_retain = np.array([1, 0, 0, 1, 0], dtype=np.bool)
       with self.assertRaises(ValueError):
@@ -358,16 +354,14 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
     return sparse_tensor.SparseTensorValue(self._IND_2_5_6, self._VAL_2_5_6,
                                            self._SHP_2_5_6)
 
-  @test_util.run_deprecated_v1
   def testStaticShapeInfoPreservedWhenNewShapeIsProvidedAndStatic(self):
     sp_input = self._SparseTensor_2x5x6()
     new_shape = np.array([3, 6, 7], dtype=np.int64)
     sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape)
     self.assertAllEqual([3, 6, 7], sp_output.get_shape())
 
-  @test_util.run_deprecated_v1
   def testBasic(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_2x5x6()
       new_shape = np.array([3, 6, 7], dtype=np.int64)
       sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape)
@@ -379,9 +373,8 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(output.values, [0, 10, 13, 14, 32, 33])
       self.assertAllEqual(output.dense_shape, [3, 6, 7])
 
-  @test_util.run_deprecated_v1
   def testInputUnavailableInGraphConstructionOk(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensorValue_2x5x6()
       new_shape = np.array([3, 6, 7], dtype=np.int64)
       sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape)
@@ -409,7 +402,7 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(output.dense_shape, [3, 6, 7])
 
   def testTightBoundingBox(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_2x5x6()
       sp_output = sparse_ops.sparse_reset_shape(sp_input)
 
@@ -421,7 +414,7 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(output.dense_shape, [2, 4, 5])
 
   def testTightBoundingBoxEmpty(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_2x5x6_Empty()
       sp_output = sparse_ops.sparse_reset_shape(sp_input)
 
@@ -431,9 +424,8 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(output.values.shape, [0])
       self.assertAllEqual(output.dense_shape, [0, 0, 0])
 
-  @test_util.run_deprecated_v1
   def testInvalidRank(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_2x5x6()
       new_shape = np.array([3, 7], dtype=np.int64)
 
@@ -450,7 +442,6 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       with self.assertRaisesOpError("x == y did not hold element-wise"):
         sess.run(out, feed_dict={new_shape: np.array([3, 7], dtype=np.int64)})
 
-  @test_util.run_deprecated_v1
   def testInvalidDimensionSizeStatic(self):
     sp_input = self._SparseTensor_2x5x6()
     new_shape = np.array([3, 7, 5], dtype=np.int64)
@@ -510,14 +501,13 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
         constant_op.constant(val, dtypes.int32),
         constant_op.constant(shape, dtypes.int64))
 
-  @test_util.run_deprecated_v1
   def testFillNumber(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       for sp_input in (self._SparseTensorValue_5x6(), self._SparseTensor_5x6()):
         sp_output, empty_row_indicator = (
             sparse_ops.sparse_fill_empty_rows(sp_input, -1))
 
-        output, empty_row_indicator_out = sess.run(
+        output, empty_row_indicator_out = self.evaluate(
             [sp_output, empty_row_indicator])
 
         self.assertAllEqual(
@@ -530,7 +520,7 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
 
   @test_util.run_deprecated_v1
   def testFillFloat(self):
-    with self.session(use_gpu=False) as sess:
+    with self.session(use_gpu=False):
       values = constant_op.constant(
           [0.0, 10.0, 13.0, 14.0, 32.0, 33.0], dtype=dtypes.float64)
       default_value = constant_op.constant(-1.0, dtype=dtypes.float64)
@@ -540,7 +530,7 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
           dense_shape=np.array([5, 6]))
       sp_output, empty_row_indicator = (sparse_ops.sparse_fill_empty_rows(
           sp_input, default_value))
-      output, empty_row_indicator_out = sess.run(
+      output, empty_row_indicator_out = self.evaluate(
           [sp_output, empty_row_indicator])
 
       self.assertAllEqual(output.indices, [[0, 0], [1, 0], [1, 3], [1, 4],
@@ -563,14 +553,13 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
       self.assertGreater(default_value_grad_err, 0)
       self.assertLess(default_value_grad_err, 1e-8)
 
-  @test_util.run_deprecated_v1
   def testFillString(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_String5x6()
       sp_output, empty_row_indicator = (
           sparse_ops.sparse_fill_empty_rows(sp_input, ""))
 
-      output, empty_row_indicator_out = sess.run(
+      output, empty_row_indicator_out = self.evaluate(
           [sp_output, empty_row_indicator])
 
       self.assertAllEqual(
@@ -582,14 +571,13 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(empty_row_indicator_out,
                           np.array([0, 0, 1, 0, 1]).astype(np.bool))
 
-  @test_util.run_deprecated_v1
   def testNoEmptyRows(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_2x6()
       sp_output, empty_row_indicator = (
           sparse_ops.sparse_fill_empty_rows(sp_input, -1))
 
-      output, empty_row_indicator_out = sess.run(
+      output, empty_row_indicator_out = self.evaluate(
           [sp_output, empty_row_indicator])
 
       self.assertAllEqual(output.indices, [[0, 0], [1, 0], [1, 3], [1, 4]])
@@ -600,7 +588,6 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
 
 class SparseAddTest(test_util.TensorFlowTestCase):
 
-  @test_util.run_deprecated_v1
   def testValuesInVariable(self):
     indices = constant_op.constant([[1]], dtype=dtypes.int64)
     values = variables.Variable([1], trainable=False, dtype=dtypes.float32)
@@ -609,7 +596,7 @@ class SparseAddTest(test_util.TensorFlowTestCase):
     sp_input = sparse_tensor.SparseTensor(indices, values, shape)
     sp_output = sparse_ops.sparse_add(sp_input, sp_input)
 
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       self.evaluate(variables.global_variables_initializer())
       output = self.evaluate(sp_output)
       self.assertAllEqual(output.values, [2])
@@ -625,7 +612,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
   dense_shape = np.array([2, 3]).astype(np.int64)
 
   def _compare(self, sp_t, reduction_axes, ndims, keep_dims, do_sum):
-    densified = sparse_ops.sparse_tensor_to_dense(sp_t).eval()
+    densified = self.evaluate(sparse_ops.sparse_tensor_to_dense(sp_t))
 
     np_ans = densified
     if reduction_axes is None:
@@ -665,7 +652,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
                                                             reduction_axes,
                                                             keep_dims)
       # Convert to dense for comparison purposes.
-      out_sparse = sparse_ops.sparse_tensor_to_dense(tf_sparse_ans).eval()
+      out_sparse = sparse_ops.sparse_tensor_to_dense(tf_sparse_ans)
 
     self.assertAllClose(np_ans, out_dense)
     self.assertAllClose(np_ans, out_sparse)
@@ -676,14 +663,13 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
     self._compare(sp_t, reduction_axes, ndims, True, False)
     self._compare(sp_t, reduction_axes, ndims, True, True)
 
-  @test_util.run_deprecated_v1
   def testSimpleAndRandomInputs(self):
     if np.__version__ == "1.13.0":
       self.skipTest("numpy 1.13.0 bug")
 
     sp_t = sparse_tensor.SparseTensor(self.ind, self.vals, self.dense_shape)
 
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       self._compare_all(sp_t, None, ndims=2)
       self._compare_all(sp_t, 0, ndims=2)
       self._compare_all(sp_t, [1], ndims=2)
@@ -694,7 +680,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
 
     np.random.seed(1618)
     test_dims = [(1618, 1, 11, 7, 1), (1,), (1, 1, 1)]
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       for dims in test_dims:
         sp_t, unused_nnz = _sparsify(np.random.randn(*dims))
         # reduce all using None
@@ -706,15 +692,15 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
 
   def testInvalidAxes(self):
     sp_t = sparse_tensor.SparseTensor(self.ind, self.vals, self.dense_shape)
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       with self.assertRaisesOpError("Invalid reduction dimension -3"):
-        sparse_ops.sparse_reduce_sum(sp_t, -3).eval()
+        self.evaluate(sparse_ops.sparse_reduce_sum(sp_t, -3))
       with self.assertRaisesOpError("Invalid reduction dimension 2"):
-        sparse_ops.sparse_reduce_sum(sp_t, 2).eval()
+        self.evaluate(sparse_ops.sparse_reduce_sum(sp_t, 2))
       with self.assertRaisesOpError("Invalid reduction dimension -3"):
-        sparse_ops.sparse_reduce_max(sp_t, -3).eval()
+        self.evaluate(sparse_ops.sparse_reduce_max(sp_t, -3))
       with self.assertRaisesOpError("Invalid reduction dimension 2"):
-        sparse_ops.sparse_reduce_max(sp_t, 2).eval()
+        self.evaluate(sparse_ops.sparse_reduce_max(sp_t, 2))
 
   @test_util.run_deprecated_v1
   def testGradient(self):
@@ -745,7 +731,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
 
   def _testSparseReduceShape(self, sp_t, reduction_axes, ndims, keep_dims,
                              do_sum):
-    densified = sparse_ops.sparse_tensor_to_dense(sp_t).eval()
+    densified = self.evaluate(sparse_ops.sparse_tensor_to_dense(sp_t))
 
     np_op = np.sum
     tf_op = sparse_ops.sparse_reduce_sum
@@ -773,7 +759,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
   def testSparseReduceSumOrMaxShape(self):
     sp_t = sparse_tensor.SparseTensor(self.ind, self.vals, self.dense_shape)
 
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       for do_sum in [True, False]:
         for keep_dims in [True, False]:
           self._testSparseReduceShape(sp_t, None, 2, keep_dims, do_sum)
@@ -790,19 +776,17 @@ class SparseMathOpsTest(test_util.TensorFlowTestCase):
   def _check(self, result_tensor, result_np, input_sp_t):
     self.assertTrue(isinstance(result_tensor, sparse_tensor.SparseTensor))
     self.assertTrue(isinstance(input_sp_t, sparse_tensor.SparseTensor))
-    self.assertAllEqual(input_sp_t.indices.eval(), result_tensor.indices.eval())
-    self.assertAllEqual(input_sp_t.dense_shape.eval(),
-                        result_tensor.dense_shape.eval())
+    self.assertAllEqual(input_sp_t.indices, result_tensor.indices)
+    self.assertAllEqual(input_sp_t.dense_shape, result_tensor.dense_shape)
 
-    res_densified = sparse_ops.sparse_to_dense(result_tensor.indices,
-                                               result_tensor.dense_shape,
-                                               result_tensor.values).eval()
+    res_densified = sparse_ops.sparse_to_dense(
+        result_tensor.indices, result_tensor.dense_shape, result_tensor.values)
     self.assertAllEqual(result_np, res_densified)
 
   @test_util.run_deprecated_v1
   def testCwiseShapeValidation(self):
     # Test case for GitHub 24072.
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       a = array_ops.ones([3, 4, 1], dtype=dtypes.int32)
       b = sparse_tensor.SparseTensor([[0, 0, 1, 0], [0, 0, 3, 0]], [10, 20],
                                      [1, 1, 4, 2])
@@ -810,21 +794,20 @@ class SparseMathOpsTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           "broadcasts dense to sparse only; got incompatible shapes"):
-        c.eval()
+        self.evaluate(c)
 
-  @test_util.run_deprecated_v1
   def testCwiseDivAndMul(self):
     np.random.seed(1618)
     sp_shapes = [(10, 10, 10), (5, 5), (1618,), (3, 3, 7)]
     dense_shapes = [(10, 10, 1), (5, 5), (1,), (1, 7)]
 
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       for dtype in [np.float32, np.float64, np.int32, np.int64]:
         for sp_shape, dense_shape in zip(sp_shapes, dense_shapes):
           sp_vals_np = np.random.rand(*sp_shape).astype(dtype) + 1
           dense_vals_np = np.random.rand(*dense_shape).astype(dtype) + 1
           sp_t, unused_nnz = _sparsify(sp_vals_np, thresh=1.5)
-          sp_t_densified = sparse_ops.sparse_tensor_to_dense(sp_t).eval()
+          sp_t_densified = sparse_ops.sparse_tensor_to_dense(sp_t)
           dense_t = constant_op.constant(dense_vals_np)
 
           self._check(sp_t / dense_t, sp_t_densified / dense_vals_np, sp_t)
@@ -834,11 +817,10 @@ class SparseMathOpsTest(test_util.TensorFlowTestCase):
 
           if dtype in [np.int32, np.int64]:
             res = sp_t / dense_t  # should invoke "__truediv__"
-            self.assertEqual(res.values.eval().dtype, np.float64)
+            self.assertEqual(res.values.dtype, np.float64)
 
-  @test_util.run_deprecated_v1
   def testCwiseAdd(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       # Identity(2) + AllOnes(2,2).  Should be equal to 2 * Identity(2).
       indices = [[0, 0], [1, 1]]
       vals = [1, 1]
@@ -901,16 +883,15 @@ class SparseSoftmaxTest(test_util.TensorFlowTestCase):
       batched_sp_t, unused_nnz1 = _sparsify(
           sp_vals_np.reshape((1, n, m)), thresh=0.)  # No masking.
 
-      with self.cached_session(use_gpu=False):
+      with test_util.force_cpu():
         densified = constant_op.constant(sp_vals_np)
 
-        sp_result = sparse_ops.sparse_softmax(batched_sp_t).eval(
-        ).values.reshape((n, m))
+        sp_result = self.evaluate(
+            sparse_ops.sparse_softmax(batched_sp_t)).values.reshape((n, m))
         dense_result = nn_ops.softmax(densified)
 
-        self.assertAllClose(dense_result.eval(), sp_result)
+        self.assertAllClose(dense_result, sp_result)
 
-  @test_util.run_deprecated_v1
   def testHigherRanks(self):
     # For the first shape:
     # First batch:
@@ -933,11 +914,11 @@ class SparseSoftmaxTest(test_util.TensorFlowTestCase):
       sp_t, unused_nnz = _sparsify(values, thresh=1e-2)
       expected_values = [1., 1., 1., .5, .5]
 
-      with self.cached_session(use_gpu=False):
-        result = sparse_ops.sparse_softmax(sp_t).eval()
+      with test_util.force_cpu():
+        result = sparse_ops.sparse_softmax(sp_t)
 
         self.assertAllEqual(expected_values, result.values)
-        self.assertAllEqual(sp_t.indices.eval(), result.indices)
+        self.assertAllEqual(sp_t.indices, result.indices)
         self.assertAllEqual(shape, result.dense_shape)
 
   @test_util.run_deprecated_v1
@@ -960,25 +941,24 @@ class SparseMinimumMaximumTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(a.values, b.values)
     self.assertAllEqual(a.dense_shape, b.dense_shape)
 
-  @test_util.run_deprecated_v1
   def testBasic(self):
-    with self.cached_session(use_gpu=False):
+    with test_util.force_cpu():
       # 1-D, values at index 0.
       sp_zero = sparse_tensor.SparseTensor([[0]], [0], [7])
       sp_one = sparse_tensor.SparseTensor([[0]], [1], [7])
-      max_tf = sparse_ops.sparse_maximum(sp_zero, sp_one).eval()
-      min_tf = sparse_ops.sparse_minimum(sp_zero, sp_one).eval()
-      self._assertSparseTensorValueEqual(sp_one.eval(), max_tf)
-      self._assertSparseTensorValueEqual(sp_zero.eval(), min_tf)
+      max_tf = sparse_ops.sparse_maximum(sp_zero, sp_one)
+      min_tf = sparse_ops.sparse_minimum(sp_zero, sp_one)
+      self._assertSparseTensorValueEqual(sp_one, max_tf)
+      self._assertSparseTensorValueEqual(sp_zero, min_tf)
 
       # Values at different indices.
       sp_zero = sparse_tensor.SparseTensor([[0]], [0], [7])
       sp_zero_2 = sparse_tensor.SparseTensor([[1]], [0], [7])
       expected = sparse_tensor.SparseTensor([[0], [1]], [0, 0], [7])
-      max_tf = sparse_ops.sparse_maximum(sp_zero, sp_zero_2).eval()
-      min_tf = sparse_ops.sparse_minimum(sp_zero, sp_zero_2).eval()
-      self._assertSparseTensorValueEqual(expected.eval(), max_tf)
-      self._assertSparseTensorValueEqual(expected.eval(), min_tf)
+      max_tf = sparse_ops.sparse_maximum(sp_zero, sp_zero_2)
+      min_tf = sparse_ops.sparse_minimum(sp_zero, sp_zero_2)
+      self._assertSparseTensorValueEqual(expected, max_tf)
+      self._assertSparseTensorValueEqual(expected, min_tf)
 
   @test_util.run_deprecated_v1
   def testRandom(self):
@@ -1008,37 +988,36 @@ class SparseMinimumMaximumTest(test_util.TensorFlowTestCase):
             np.minimum(a_densified, b_densified), minimum_tf_densified)
 
   def testMismatchedShapes(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       sp_zero = sparse_tensor.SparseTensor([[0, 0]], [0], [1, 1])
       sp_one = sparse_tensor.SparseTensor([[0]], [1], [2])
       with self.assertRaisesOpError("Operands do not have the same ranks"):
-        sparse_ops.sparse_maximum(sp_zero, sp_one).eval()
+        self.evaluate(sparse_ops.sparse_maximum(sp_zero, sp_one))
 
       sp_zero = sparse_tensor.SparseTensor([[0]], [0], [1])
       sp_one = sparse_tensor.SparseTensor([[0]], [1], [2])
       with self.assertRaisesOpError("Operands' shapes do not match"):
-        sparse_ops.sparse_maximum(sp_zero, sp_one).eval()
+        self.evaluate(sparse_ops.sparse_maximum(sp_zero, sp_one))
 
 
 class SparseTransposeTest(test.TestCase):
 
-  @test_util.run_deprecated_v1
   def testTranspose(self):
     if np.__version__ == "1.13.0":
       self.skipTest("numpy 1.13.0 bug")
 
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       np.random.seed(1618)
       shapes = [np.random.randint(1, 10, size=rank) for rank in range(1, 6)]
       for shape in shapes:
         for dtype in [np.int32, np.int64, np.float32, np.float64]:
           dn_input = np.random.randn(*shape).astype(dtype)
-          rank = array_ops.rank(dn_input).eval()
+          rank = self.evaluate(array_ops.rank(dn_input))
           perm = np.random.choice(rank, rank, False)
           sp_input, unused_a_nnz = _sparsify(dn_input)
           sp_trans = sparse_ops.sparse_transpose(sp_input, perm=perm)
-          dn_trans = sparse_ops.sparse_tensor_to_dense(sp_trans).eval()
-          expected_trans = array_ops.transpose(dn_input, perm=perm).eval()
+          dn_trans = sparse_ops.sparse_tensor_to_dense(sp_trans)
+          expected_trans = array_ops.transpose(dn_input, perm=perm)
           self.assertAllEqual(expected_trans.shape, sp_trans.get_shape())
           self.assertAllEqual(dn_trans, expected_trans)
 
diff --git a/tensorflow/python/kernel_tests/template_test.py b/tensorflow/python/kernel_tests/template_test.py
index 3b2a56bd1ff6ef81ae17773fd5a23bc96778ce63..f587a7ec4329a1b9a4df5bbfb3d8edcc1773cbcb 100644
--- a/tensorflow/python/kernel_tests/template_test.py
+++ b/tensorflow/python/kernel_tests/template_test.py
@@ -160,6 +160,21 @@ class TemplateTest(test.TestCase):
     self.assertEqual(1, len(result))
     self.assertNotEqual(len(first), len(result))
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_template_with_empty_name(self):
+    tpl = template.make_template("", variable_scoped_function)
+    with variable_scope.variable_scope("outer"):
+      x = variable_scope.get_variable("x", [])
+      v = tpl()
+    self.assertEqual("outer/", tpl.variable_scope_name)
+    self.assertEqual("outer//dummy:0", v.name)
+    if context.executing_eagerly():
+      # In eager mode `x` is not visible to the template since the template does
+      # not rely on global collections.
+      self.assertEqual([v], tpl.variables)
+    else:
+      self.assertEqual([x, v], tpl.variables)
+
   @test_util.run_in_graph_and_eager_modes
   def test_template_with_name(self):
     tmpl1 = template.make_template("s1", variable_scoped_function)
diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
index 147e7fde5793d4ac0b85696715aa7645f8e79bb2..e8af998a7049c051f33d91a7a2a79bf8b92d9b33 100644
--- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py
+++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
@@ -185,8 +185,8 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([[0.0, 0.0], [4.0, 5.0], [0.0, 0.0]],
                           self.evaluate(ta.write(1, [[4.0, 5.0]]).concat()))
 
-  @test_util.disable_control_flow_v2("b/118890905")
-  @test_util.run_v1_only("b/118890905")
+  @test_util.disable_control_flow_v2("b/122324791")
+  @test_util.run_v1_only("b/122324791")
   def testTensorArrayReadOrPackNotAllValuesAvailableFillsZeros(self):
     self._testTensorArrayReadOrPackNotAllValuesAvailableFillsZeros()
 
@@ -202,8 +202,8 @@ class TensorArrayTest(test.TestCase):
     self.assertAllEqual([[0.0, 0.0], [4.0, 5.0], [0.0, 0.0]],
                         self.evaluate(ta.write(1, [[4.0, 5.0]]).concat()))
 
-  @test_util.disable_control_flow_v2("b/118890905")
-  @test_util.run_v1_only("b/118890905")
+  @test_util.disable_control_flow_v2("b/122324791")
+  @test_util.run_v1_only("b/122324791")
   def testTensorArrayReadOrPackNotAllValuesAvailableInferShapeFillsZeros(self):
     self._testTensorArrayReadOrPackNotAllValuesAvailableInferShapeFillsZeros()
 
@@ -1000,13 +1000,11 @@ class TensorArrayTest(test.TestCase):
     # self._testWhileLoopWritePackGradients(
     #     dynamic_size=False, dtype=tf.int64)
 
-  @test_util.disable_control_flow_v2("b/117943489 (dynamic_size)")
   @test_util.run_v1_only("b/117943489")
   def testSkipEagerWhileLoopDynamicWritePackGradients(self):
     self._testWhileLoopWritePackGradients(
         dynamic_size=True, dtype=dtypes.float32)
 
-  @test_util.disable_control_flow_v2("b/119323158")
   def testGradSerialTwoLoops(self):
     with self.session(use_gpu=True):
       def loop(x):
@@ -1207,11 +1205,14 @@ class TensorArrayTest(test.TestCase):
       c1 = constant_op.constant([4.0, 5.0])
       w1 = w0.write(3, c1)
 
-      with self.assertRaisesOpError(
-          r"Could not read index 0 twice because it was cleared after a "
-          r"previous read \(perhaps try setting clear_after_read = false\?\)"):
-        with ops.control_dependencies([r0]):
-          self.evaluate(w1.read(0))
+      if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
+        # TensorArray v2 does not support clear_after_read.
+        with self.assertRaisesOpError(
+            r"Could not read index 0 twice because it was cleared after a "
+            r"previous read \(perhaps try setting clear_after_read = false\?\)"
+        ):
+          with ops.control_dependencies([r0]):
+            self.evaluate(w1.read(0))
 
       r1 = w1.read(1)
       self.assertAllEqual(c1.get_shape(), r1.shape)
@@ -1220,7 +1221,6 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaises(ValueError):
         w1.write(4, c2)
 
-  @test_util.disable_control_flow_v2("b/117943489 (dynamic_size)")
   @test_util.run_v1_only("b/117943489")
   def testUnpackShape(self):
     self._testUnpackShape()
@@ -1300,12 +1300,10 @@ class TensorArrayTest(test.TestCase):
       grad = gradients_impl.gradients(ys=[r], xs=[x])
       self.assertAllEqual(np.array([1.0, 1.0, 1.0]), self.evaluate(grad)[0])
 
-  @test_util.disable_control_flow_v2("b/117943489")
   @test_util.run_v1_only("b/117943489")
   def testSkipEagerTensorArrayUnpackDynamic(self):
     self._testTensorArrayUnpackDynamic()
 
-  @test_util.disable_control_flow_v2("b/117943489")
   @test_util.run_v1_only("b/117943489")
   def testSkipEagerTensorArraySplitDynamic(self):
     with self.session(use_gpu=True) as sess:
@@ -1323,8 +1321,8 @@ class TensorArrayTest(test.TestCase):
     with self.cached_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, size=0, dynamic_size=False, infer_shape=False)
-      v2_msg = ("Tried to stack elements of a empty list with "
-                "non-fully-defined shape")
+      v2_msg = ("Tried to stack elements of a empty list with non-fully-defined"
+                " element_shape")
       v1_msg = (
           "TensorArray has size zero, but element shape <unknown> is not "
           "fully defined. Currently only static shapes are supported when "
@@ -1345,7 +1343,10 @@ class TensorArrayTest(test.TestCase):
           dtype=dtypes.float32, size=0, dynamic_size=False, infer_shape=True)
       self.assertEqual(0, ta.size().eval())
       # Don't actually perform the pack.  This stores the static shape.
-      ta.unstack(array_ops.zeros([0, 3, 5])).mark_used()
+      if control_flow_util.ENABLE_CONTROL_FLOW_V2:
+        ta = ta.unstack(array_ops.zeros([0, 3, 5]))
+      else:
+        ta.unstack(array_ops.zeros([0, 3, 5])).mark_used()
       packed = ta.stack()
       concatenated = ta.concat()
       self.assertAllEqual([0, 3, 5], self.evaluate(packed).shape)
@@ -1353,12 +1354,10 @@ class TensorArrayTest(test.TestCase):
       # first dimension of zero
       self.assertAllEqual([0, 5], self.evaluate(concatenated).shape)
 
-  @test_util.disable_control_flow_v2("b/117943489")
   @test_util.run_v1_only("b/117943489")
   def testSkipEagerTensorArrayEvalEmptyWithDefault(self):
     self._testTensorArrayEvalEmptyWithDefault()
 
-  @test_util.disable_control_flow_v2("b/117943489")
   @test_util.run_v1_only("b/117943489")
   def testSkipEagerTensorArrayScatterReadAndGradients(self):
     with self.session(use_gpu=True) as session:
@@ -1386,8 +1385,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([10.0, -10.0], read_vals[1])
       self.assertAllEqual([[2.0, 3.0], [4.0, 5.0]], grad_vals[0])
 
-  @test_util.disable_control_flow_v2("b/117943286")
-  @test_util.run_v1_only("b/117943286")
+  @test_util.run_v1_only("b/118890905")
   def testTensorArrayWriteGatherAndGradients(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 81ea47346646519c66cd04e6b99919fd941ddaed..81558db04f0a498866a206e8dba540ceec0ab0d5 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -430,6 +430,7 @@ def size_internal(input, name=None, optimize=True, out_type=dtypes.int32):
 
 
 @tf_export("rank")
+@dispatch.add_dispatch_support
 def rank(input, name=None):
   # pylint: disable=redefined-builtin
   """Returns the rank of a tensor.
@@ -3190,7 +3191,7 @@ def where(condition, x=None, y=None, name=None):
 
   Returns:
     A `Tensor` with the same type and shape as `x`, `y` if they are non-None.
-    A `Tensor` with shape `(num_true, dim_size(condition))`.
+    Otherwise, a `Tensor` with shape `(num_true, rank(condition))`.
 
   Raises:
     ValueError: When exactly one of `x` or `y` is non-None.
diff --git a/tensorflow/python/ops/cond_v2.py b/tensorflow/python/ops/cond_v2.py
index 7d09e32e241d55f064239bbfd4c4af45ac329c4b..5dca8e501b2ba12ce9bbd8768a3fab16cbf79f74 100644
--- a/tensorflow/python/ops/cond_v2.py
+++ b/tensorflow/python/ops/cond_v2.py
@@ -68,14 +68,14 @@ def cond_v2(pred, true_fn, false_fn, name="cond"):
         true_name,
         true_fn, [], {},
         func_graph=util.CondBranchFuncGraph(
-            true_name, read_only_collections=False),
+            true_name, collections=ops.get_default_graph()._collections),  # pylint: disable=protected-access
         add_control_dependencies=add_control_dependencies,
         op_return_value=pred)
     false_graph = func_graph_module.func_graph_from_py_func(
         false_name,
         false_fn, [], {},
         func_graph=util.CondBranchFuncGraph(
-            false_name, read_only_collections=False),
+            false_name, collections=ops.get_default_graph()._collections),  # pylint: disable=protected-access
         add_control_dependencies=add_control_dependencies,
         op_return_value=pred)
 
@@ -554,7 +554,8 @@ class _CondGradFuncGraph(util.CondBranchFuncGraph):
   """
 
   def __init__(self, name, forward_graph):
-    super(_CondGradFuncGraph, self).__init__(name, read_only_collections=False)
+    super(_CondGradFuncGraph, self).__init__(
+        name, collections=ops.get_default_graph()._collections)  # pylint: disable=protected-access
     self.if_op_needs_rewrite = False
     self._forward_graph = forward_graph
     # Maps from forward intermediate tensor -> the unwrapped captured
diff --git a/tensorflow/python/ops/control_flow_util.py b/tensorflow/python/ops/control_flow_util.py
index 8f5442da5e402173511163ae821b5aafab2fb9d8..ff0dff0042e409cc12131ca4e97731a210c6203b 100644
--- a/tensorflow/python/ops/control_flow_util.py
+++ b/tensorflow/python/ops/control_flow_util.py
@@ -57,6 +57,15 @@ def InXlaContext(graph):
   return GetContainingXLAContext(ctxt) is not None
 
 
+def GraphOrParentsInXlaContext(graph):
+  while True:
+    if InXlaContext(graph): return True
+    try:
+      graph = graph.outer_graph
+    except AttributeError:
+      return False
+
+
 def IsInWhileLoop(op):
   ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
   return GetContainingWhileContext(ctxt) is not None
diff --git a/tensorflow/python/ops/control_flow_util_v2.py b/tensorflow/python/ops/control_flow_util_v2.py
index 5f56850884a5e9e424c77515406ef8c9b513e972..58917ad264a56578bb4c98ff9a3ef0b63a3cbf12 100644
--- a/tensorflow/python/ops/control_flow_util_v2.py
+++ b/tensorflow/python/ops/control_flow_util_v2.py
@@ -114,7 +114,7 @@ def maybe_set_lowering_attr(op):
   Args:
     op: An `If` or `While` Operation.
   """
-  if (not control_flow_util.IsInXLAContext(op) and
+  if (not control_flow_util.GraphOrParentsInXlaContext(op.graph) and
       context.context().get_function_call_options().executor_type
       != "SINGLE_THREADED_EXECUTOR"):
     # pylint: disable=protected-access
diff --git a/tensorflow/python/ops/linalg/inverse_registrations.py b/tensorflow/python/ops/linalg/inverse_registrations.py
new file mode 100644
index 0000000000000000000000000000000000000000..b89bdd240edf7c27ae759665052f5b3c5e38d9d5
--- /dev/null
+++ b/tensorflow/python/ops/linalg/inverse_registrations.py
@@ -0,0 +1,108 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Registrations for LinearOperator.inverse."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_algebra
+from tensorflow.python.ops.linalg import linear_operator_block_diag
+from tensorflow.python.ops.linalg import linear_operator_circulant
+from tensorflow.python.ops.linalg import linear_operator_diag
+from tensorflow.python.ops.linalg import linear_operator_identity
+from tensorflow.python.ops.linalg import linear_operator_inversion
+from tensorflow.python.ops.linalg import linear_operator_kronecker
+
+
+# By default, return LinearOperatorInversion which switched the .matmul
+# and .solve methods.
+@linear_operator_algebra.RegisterInverse(linear_operator.LinearOperator)
+def _inverse_linear_operator(linop):
+  return linear_operator_inversion.LinearOperatorInversion(
+      linop,
+      is_non_singular=linop.is_non_singular,
+      is_self_adjoint=linop.is_self_adjoint,
+      is_positive_definite=linop.is_positive_definite,
+      is_square=linop.is_square)
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_diag.LinearOperatorDiag)
+def _inverse_diag(diag_operator):
+  return linear_operator_diag.LinearOperatorDiag(
+      1. / diag_operator.diag,
+      is_non_singular=diag_operator.is_non_singular,
+      is_self_adjoint=diag_operator.is_self_adjoint,
+      is_positive_definite=diag_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_identity.LinearOperatorIdentity)
+def _inverse_identity(identity_operator):
+  return identity_operator
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_identity.LinearOperatorScaledIdentity)
+def _inverse_scaled_identity(identity_operator):
+  return linear_operator_identity.LinearOperatorScaledIdentity(
+      num_rows=identity_operator._num_rows,  # pylint: disable=protected-access
+      multiplier=1. / identity_operator.multiplier,
+      is_non_singular=identity_operator.is_non_singular,
+      is_self_adjoint=True,
+      is_positive_definite=identity_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_block_diag.LinearOperatorBlockDiag)
+def _inverse_block_diag(block_diag_operator):
+    # We take the inverse of each block on the diagonal.
+  return linear_operator_block_diag.LinearOperatorBlockDiag(
+      operators=[
+          operator.inverse() for operator in block_diag_operator.operators],
+      is_non_singular=block_diag_operator.is_non_singular,
+      is_self_adjoint=block_diag_operator.is_self_adjoint,
+      is_positive_definite=block_diag_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_kronecker.LinearOperatorKronecker)
+def _inverse_kronecker(kronecker_operator):
+    # Inverse decomposition of a Kronecker product is the Kronecker product
+    # of inverse decompositions.
+  return linear_operator_kronecker.LinearOperatorKronecker(
+      operators=[
+          operator.inverse() for operator in kronecker_operator.operators],
+      is_non_singular=kronecker_operator.is_non_singular,
+      is_self_adjoint=kronecker_operator.is_self_adjoint,
+      is_positive_definite=kronecker_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_circulant.LinearOperatorCirculant)
+def _inverse_circulant(circulant_operator):
+  # Inverting the spectrum is sufficient to get the inverse.
+  return linear_operator_circulant.LinearOperatorCirculant(
+      spectrum=1. / circulant_operator.spectrum,
+      is_non_singular=circulant_operator.is_non_singular,
+      is_self_adjoint=circulant_operator.is_self_adjoint,
+      is_positive_definite=circulant_operator.is_positive_definite,
+      is_square=True)
diff --git a/tensorflow/python/ops/linalg/linalg.py b/tensorflow/python/ops/linalg/linalg.py
index ac4fd4ebc6059a187828c757c852a470d8ee69a8..eebe741337d8eefae44e5206ce990edbf261bdd9 100644
--- a/tensorflow/python/ops/linalg/linalg.py
+++ b/tensorflow/python/ops/linalg/linalg.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import,unused-import
 from tensorflow.python.ops.linalg import cholesky_registrations as _cholesky_registrations
+from tensorflow.python.ops.linalg import inverse_registrations as _inverse_registrations
 from tensorflow.python.ops.linalg import linear_operator_algebra as _linear_operator_algebra
 from tensorflow.python.ops.linalg import matmul_registrations as _matmul_registrations
 from tensorflow.python.ops.linalg.linalg_impl import *
diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py
index df2bd887cdde6f651db572c2bdfebd2bc0170716..2259eaa65cd1a857e369ee8673165c76c882df7e 100644
--- a/tensorflow/python/ops/linalg/linalg_impl.py
+++ b/tensorflow/python/ops/linalg/linalg_impl.py
@@ -104,6 +104,7 @@ def adjoint(matrix, name=None):
   tf.linalg.adjoint(x)  # [[1 - 1j, 4 - 4j],
                         #  [2 - 2j, 5 - 5j],
                         #  [3 - 3j, 6 - 6j]]
+  ```
 
   Args:
     matrix:  A `Tensor`. Must be `float16`, `float32`, `float64`, `complex64`,
diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py
index 6be81f4b34191414d3c4c00ac7158bfa1539ef27..4c99e86dc59a8c39abb57494ae84bcfdc13faa1b 100644
--- a/tensorflow/python/ops/linalg/linear_operator.py
+++ b/tensorflow/python/ops/linalg/linear_operator.py
@@ -847,6 +847,31 @@ class LinearOperator(object):
 
       return self._solvevec(rhs, adjoint=adjoint)
 
+  def inverse(self, name="inverse"):
+    """Returns the Inverse of this `LinearOperator`.
+
+    Given `A` representing this `LinearOperator`, return a `LinearOperator`
+    representing `A^-1`.
+
+    Args:
+      name: A name scope to use for ops added by this method.
+
+    Returns:
+      `LinearOperator` representing inverse of this matrix.
+
+    Raises:
+      ValueError: When the `LinearOperator` is not hinted to be `non_singular`.
+    """
+    if self.is_square is False:  # pylint: disable=g-bool-id-comparison
+      raise ValueError("Cannot take the Inverse: This operator represents "
+                       "a non square matrix.")
+    if self.is_non_singular is False:  # pylint: disable=g-bool-id-comparison
+      raise ValueError("Cannot take the Inverse: This operator represents "
+                       "a singular matrix.")
+
+    with self._name_scope(name):
+      return linear_operator_algebra.inverse(self)
+
   def cholesky(self, name="cholesky"):
     """Returns a Cholesky factor as a `LinearOperator`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_algebra.py b/tensorflow/python/ops/linalg/linear_operator_algebra.py
index 7b99066e4c121ebd7546dfad1039c0dfa46bca11..c1513fdb38c6005c89f6994141797f7df5c65350 100644
--- a/tensorflow/python/ops/linalg/linear_operator_algebra.py
+++ b/tensorflow/python/ops/linalg/linear_operator_algebra.py
@@ -27,6 +27,7 @@ from tensorflow.python.util import tf_inspect
 
 _CHOLESKY_DECOMPS = {}
 _MATMUL = {}
+_INVERSES = {}
 
 
 def _registered_function(type_list, registry):
@@ -55,6 +56,11 @@ def _registered_matmul(type_a, type_b):
   return _registered_function([type_a, type_b], _MATMUL)
 
 
+def _registered_inverse(type_a):
+  """Get the Cholesky function registered for class a."""
+  return _registered_function([type_a], _INVERSES)
+
+
 def cholesky(lin_op_a, name=None):
   """Get the Cholesky factor associated to lin_op_a.
 
@@ -103,6 +109,29 @@ def matmul(lin_op_a, lin_op_b, name=None):
     return matmul_fn(lin_op_a, lin_op_b)
 
 
+def inverse(lin_op_a, name=None):
+  """Get the Inverse associated to lin_op_a.
+
+  Args:
+    lin_op_a: The LinearOperator to decompose.
+    name: Name to use for this operation.
+
+  Returns:
+    A LinearOperator that represents the inverse of `lin_op_a`.
+
+  Raises:
+    NotImplementedError: If no Inverse method is defined for the LinearOperator
+      type of `lin_op_a`.
+  """
+  inverse_fn = _registered_inverse(type(lin_op_a))
+  if inverse_fn is None:
+    raise ValueError("No inverse registered for {}".format(
+        type(lin_op_a)))
+
+  with ops.name_scope(name, "Inverse"):
+    return inverse_fn(lin_op_a)
+
+
 class RegisterCholesky(object):
   """Decorator to register a Cholesky implementation function.
 
@@ -189,3 +218,45 @@ class RegisterMatmul(object):
           self._key[1].__name__))
     _MATMUL[self._key] = matmul_fn
     return matmul_fn
+
+
+class RegisterInverse(object):
+  """Decorator to register an Inverse implementation function.
+
+  Usage:
+
+  @linear_operator_algebra.RegisterInverse(lin_op.LinearOperatorIdentity)
+  def _inverse_identity(lin_op_a):
+    # Return the identity matrix.
+  """
+
+  def __init__(self, lin_op_cls_a):
+    """Initialize the LinearOperator registrar.
+
+    Args:
+      lin_op_cls_a: the class of the LinearOperator to decompose.
+    """
+    self._key = (lin_op_cls_a,)
+
+  def __call__(self, inverse_fn):
+    """Perform the Inverse registration.
+
+    Args:
+      inverse_fn: The function to use for the Inverse.
+
+    Returns:
+      inverse_fn
+
+    Raises:
+      TypeError: if inverse_fn is not a callable.
+      ValueError: if a Inverse function has already been registered for
+        the given argument classes.
+    """
+    if not callable(inverse_fn):
+      raise TypeError(
+          "inverse_fn must be callable, received: {}".format(inverse_fn))
+    if self._key in _INVERSES:
+      raise ValueError("Inverse({}) has already been registered to: {}".format(
+          self._key[0].__name__, _INVERSES[self._key]))
+    _INVERSES[self._key] = inverse_fn
+    return inverse_fn
diff --git a/tensorflow/python/ops/linalg/linear_operator_kronecker.py b/tensorflow/python/ops/linalg/linear_operator_kronecker.py
index f7e785caa5d8cc290f037944378f709633423a74..005b9b429b6327211feb9466bdca59b7a50870a7 100644
--- a/tensorflow/python/ops/linalg/linear_operator_kronecker.py
+++ b/tensorflow/python/ops/linalg/linear_operator_kronecker.py
@@ -71,7 +71,7 @@ class LinearOperatorKronecker(linear_operator.LinearOperator):
   `op1 x op2 x .. opJ` (we omit parentheses as the Kronecker product is
   associative).
 
-  If `opj` has shape `batch_shape_j` + [M_j, N_j`, then the composed operator
+  If `opj` has shape `batch_shape_j + [M_j, N_j]`, then the composed operator
   will have shape equal to `broadcast_batch_shape + [prod M_j, prod N_j]`,
   where the product is over all operators.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_test_util.py b/tensorflow/python/ops/linalg/linear_operator_test_util.py
index e50f572b5f431ae8b7cf3470ee799f170e83656c..a957c84dc1ca6f26927ae3c39c6cb49caa2b19be 100644
--- a/tensorflow/python/ops/linalg/linear_operator_test_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_test_util.py
@@ -336,6 +336,22 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     self._skip_if_tests_to_skip_contains("solve_with_broadcast")
     self._test_solve(with_batch=False)
 
+  def _test_inverse(self):
+    for use_placeholder in self._use_placeholder_options:
+      for build_info in self._operator_build_infos:
+        for dtype in self._dtypes_to_test:
+          with self.session(graph=ops.Graph()) as sess:
+            sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+            operator, mat = self._operator_and_matrix(
+                build_info, dtype, use_placeholder=use_placeholder)
+            op_inverse_v, mat_inverse_v = sess.run([
+                operator.inverse().to_dense(), linalg.inv(mat)])
+            self.assertAC(op_inverse_v, mat_inverse_v)
+
+  def test_inverse(self):
+    self._skip_if_tests_to_skip_contains("inverse")
+    self._test_inverse()
+
   def test_trace(self):
     self._skip_if_tests_to_skip_contains("trace")
     for use_placeholder in self._use_placeholder_options:
@@ -463,7 +479,14 @@ class NonSquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
   @property
   def _tests_to_skip(self):
     """List of test names to skip."""
-    return ["cholesky", "solve", "solve_with_broadcast", "det", "log_abs_det"]
+    return [
+        "cholesky",
+        "inverse",
+        "solve",
+        "solve_with_broadcast",
+        "det",
+        "log_abs_det"
+    ]
 
   @property
   def _operator_build_infos(self):
diff --git a/tensorflow/python/ops/linalg/linear_operator_util.py b/tensorflow/python/ops/linalg/linear_operator_util.py
index 54d04e4a70bc65e0053575e7761680894e3702e5..6c18943dab03d434cb92d5510f48066f46615ba5 100644
--- a/tensorflow/python/ops/linalg/linear_operator_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_util.py
@@ -481,9 +481,9 @@ def _reshape_for_efficiency(a,
 
   # Permutation to put the extra dims at the end.
   perm = (
-      array_ops.concat(
-          (math_ops.range(b_extra_ndims, b.shape.ndims),
-           math_ops.range(0, b_extra_ndims)), 0))
+      np.concatenate(
+          (np.arange(b_extra_ndims, b.shape.ndims),
+           np.arange(0, b_extra_ndims)), 0))
   b_extra_on_end = array_ops.transpose(b, perm=perm)
 
   # Now squash this end into one long dim.
@@ -497,7 +497,7 @@ def _reshape_for_efficiency(a,
     y_extra_shape = array_ops.concat(
         (array_ops.shape(y)[:-1], [b_main_sh[-1]], b_extra_sh), 0)
     y_extra_on_end = array_ops.reshape(y, y_extra_shape)
-    return array_ops.transpose(
-        y_extra_on_end, perm=array_ops.invert_permutation(perm))
+    inverse_perm = np.argsort(perm)
+    return array_ops.transpose(y_extra_on_end, perm=inverse_perm)
 
   return a, b_squashed_end, reshape_inv, still_need_to_transpose
diff --git a/tensorflow/python/ops/list_ops.py b/tensorflow/python/ops/list_ops.py
index dbaae886d43e46ac193d1e7f28a6367192d2a640..df928ea85df895ffd3439f5b8a2a430f8c517cd5 100644
--- a/tensorflow/python/ops/list_ops.py
+++ b/tensorflow/python/ops/list_ops.py
@@ -28,6 +28,12 @@ from tensorflow.python.ops import gen_list_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_list_ops import *
 # pylint: enable=wildcard-import
+from tensorflow.python.util.lazy_loader import LazyLoader
+
+# list_ops -> control_flow_ops -> tensor_array_ops -> list_ops
+control_flow_ops = LazyLoader(
+    "control_flow_ops", globals(),
+    "tensorflow.python.ops.control_flow_ops")
 
 
 ops.NotDifferentiable("TensorListConcatLists")
@@ -65,11 +71,13 @@ def tensor_list_from_tensor(tensor, element_shape, name=None):
       name=name)
 
 
-def tensor_list_concat(input_handle, element_dtype, name=None):
+def tensor_list_concat(input_handle, element_dtype, element_shape=None,
+                       name=None):
   # Ignore the lengths output of TensorListConcat. It is only used during
   # gradient computation.
   return gen_list_ops.tensor_list_concat(
-      input_handle=input_handle, element_dtype=element_dtype, name=name)[0]
+      input_handle=input_handle, element_dtype=element_dtype,
+      element_shape=element_shape, name=name)[0]
 
 
 def tensor_list_split(tensor, element_shape, lengths, name=None):
@@ -80,6 +88,25 @@ def tensor_list_split(tensor, element_shape, lengths, name=None):
       name=name)
 
 
+def tensor_list_set_item(input_handle,
+                         index,
+                         item,
+                         resize_if_index_out_of_bounds=False,
+                         name=None):
+  """Sets `item` at `index` in input list."""
+  if resize_if_index_out_of_bounds:
+    input_list_size = gen_list_ops.tensor_list_length(input_handle)
+    # TODO(srbs): This could cause some slowdown. Consider fusing resize
+    # functionality in the SetItem op.
+    input_handle = control_flow_ops.cond(
+        index >= input_list_size,
+        lambda: gen_list_ops.tensor_list_resize(  # pylint: disable=g-long-lambda
+            input_handle, index + 1),
+        lambda: input_handle)
+  return gen_list_ops.tensor_list_set_item(
+      input_handle=input_handle, index=index, item=item, name=name)
+
+
 @ops.RegisterGradient("TensorListPushBack")
 def _PushBackGrad(op, dresult):
   return gen_list_ops.tensor_list_pop_back(
@@ -164,19 +191,32 @@ def _TensorListSetItemGrad(op, dlist):
   return list_grad, index_grad, element_grad
 
 
+@ops.RegisterGradient("TensorListResize")
+def _TensorListResizeGrad(op, dlist):
+  input_list, _ = op.inputs
+  input_list_size = gen_list_ops.tensor_list_length(input_list)
+  return gen_list_ops.tensor_list_resize(dlist, input_list_size), None
+
+
 @ops.RegisterGradient("TensorListGather")
 def _TensorListGatherGrad(op, dtensor):
-  _, indices = op.inputs
-  return gen_list_ops.tensor_list_scatter(
-      tensor=dtensor, indices=indices,
-      element_shape=ops.convert_to_tensor(-1, dtype=dtypes.int32)), None
+  input_list, indices = op.inputs
+  dlist = gen_list_ops.tensor_list_scatter(
+      tensor=dtensor,
+      indices=indices,
+      element_shape=ops.convert_to_tensor(-1, dtype=dtypes.int32))
+  # TensorListScatter returns a list with size `max(indices) + 1`
+  # so we manually resize it to match the size of the input list.
+  input_list_size = gen_list_ops.tensor_list_length(input_list)
+  dlist = gen_list_ops.tensor_list_resize(dlist, input_list_size)
+  return dlist, None
 
 
 @ops.RegisterGradient("TensorListScatter")
 def _TensorListScatterGrad(op, dlist):
   t, indices, _ = op.inputs
   return gen_list_ops.tensor_list_gather(
-      dlist, indices, element_dtype=t.dtype), None
+      dlist, indices, element_dtype=t.dtype), None, None
 
 
 def _build_element_shape(shape):
diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py
index 5a948a21946d0b9ce867901a00425857e4f06b1f..3cb16eb81e8c0796e199edb9c97acd1c269c832b 100644
--- a/tensorflow/python/ops/logging_ops.py
+++ b/tensorflow/python/ops/logging_ops.py
@@ -263,7 +263,7 @@ def print_v2(*inputs, **kwargs):
   # If we are only printing a single string scalar, there is no need to format
   if (len(inputs) == 1 and tensor_util.is_tensor(inputs[0])
       and (not isinstance(inputs[0], sparse_tensor.SparseTensor))
-      and inputs[0].shape and (inputs[0].dtype == dtypes.string)):
+      and (inputs[0].shape.ndims == 0)and (inputs[0].dtype == dtypes.string)):
     formatted_string = inputs[0]
   # Otherwise, we construct an appropriate template for the tensors we are
   # printing, and format the template using those tensors.
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index e96c93c15c27ebbdf833c6b97dd9f2ce8c0e4faa..aaeb52ca2587dd934846c12d27eea3dc43c05224 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -339,6 +339,16 @@ class TableInitializerBase(checkpointable_base.CheckpointableBase):
     """Returns the table initialization op."""
     raise NotImplementedError
 
+  @property
+  def _shared_name(self):
+    """Returns a shared name to be used by the table."""
+    shared_name = ""
+    if context.executing_eagerly():
+      # Ensure a unique name when eager execution is enabled to avoid spurious
+      # sharing issues.
+      shared_name += str(ops.uid())
+    return shared_name
+
 
 class KeyValueTensorInitializer(TableInitializerBase):
   """Table initializers given `keys` and `values` tensors."""
@@ -498,6 +508,7 @@ class TextFileInitializer(TableInitializerBase):
     if not isinstance(filename, ops.Tensor) and not filename:
       raise ValueError("Filename required for %s." % name)
 
+    self._filename_arg = filename
     key_dtype = dtypes.as_dtype(key_dtype)
     value_dtype = dtypes.as_dtype(value_dtype)
 
@@ -569,6 +580,23 @@ class TextFileInitializer(TableInitializerBase):
       ops.add_to_collection(ops.GraphKeys.ASSET_FILEPATHS, filename)
     return init_op
 
+  @property
+  def _shared_name(self):
+    if self._vocab_size:
+      # Keep the shared_name:
+      # <table_type>_<filename>_<vocab_size>_<key_index>_<value_index>
+      shared_name = "hash_table_%s_%d_%s_%s" % (self._filename_arg,
+                                                self._vocab_size,
+                                                self._key_index,
+                                                self._value_index)
+    else:
+      # Keep the shared_name
+      # <table_type>_<filename>_<key_index>_<value_index>
+      shared_name = "hash_table_%s_%s_%s" % (self._filename_arg,
+                                             self._key_index,
+                                             self._value_index)
+    return shared_name
+
 
 class TextFileStringTableInitializer(TextFileInitializer):
   """Table initializer for `int64` IDs to string tables from a text file."""
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 5bccf5493f3f56fdea5c7d17010d9ee9113b8833..248d092538462219d037534f06793d6cbb205af8 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -813,7 +813,8 @@ def _OverrideBinaryOperatorHelper(func, op_name, clazz_object=ops.Tensor):
         return func(x, y, name=name)
       elif not isinstance(y, sparse_tensor.SparseTensor):
         try:
-          y = ops.convert_to_tensor(y, dtype=x.dtype.base_dtype, name="y")
+          y = ops.convert_to_tensor_v2(y, dtype_hint=x.dtype.base_dtype,
+                                       name="y")
         except TypeError:
           # If the RHS is not a tensor, it might be a tensor aware object
           # that can implement the operator with knowledge of itself
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index b27cf7208c3e38467306f40ccf8c971e29a5642c..b4832e09c084e7165143f4e918b9ba76842e2311 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -22,6 +22,7 @@ import numpy as np
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -664,5 +665,22 @@ class NextAfterTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(math_ops.nextafter(one, two) - one, eps_const)
 
 
+class BinaryOpsTest(test_util.TensorFlowTestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testErrorReceivedIfDtypeMismatchFromOp(self):
+    if context.executing_eagerly():
+      error = errors_impl.InvalidArgumentError
+      error_message = (
+          r"cannot compute Add as input #0\(zero-based\) was expected to be a "
+          r"float tensor but is a int32 tensor \[Op:Add\] name: add/")
+    else:
+      error = TypeError
+      error_message = ("Input 'y' of 'Add' Op has type float32 that does not "
+                       "match type int32 of argument 'x'.")
+    with self.assertRaisesRegexp(error, error_message):
+      a = array_ops.ones([1], dtype=dtypes.int32) + 1.0
+      self.evaluate(a)
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 34404edc9a1250710d4cd7a50e04ad8d187a5d7f..6ca2b2aafe3145978e6610cded32719173368eb8 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -314,10 +314,10 @@ def _BiasAddGradGrad(op, received_grad):
 
   if data_format == b"NCHW":
     expanded_shape = array_ops.concat([
-        array_ops.ones_like(shape[:-3]), bias_shape,
-        array_ops.ones_like(shape[-2:])
+        array_ops.ones_like(shape[:1]), bias_shape,
+        array_ops.ones_like(shape[2:])
     ], 0)
-    tile_mults = array_ops.concat([shape[:-3], [1], shape[-2:]], 0)
+    tile_mults = array_ops.concat([shape[:1], [1], shape[2:]], 0)
   else:
     expanded_shape = array_ops.concat(
         [array_ops.ones_like(shape[:-1]), bias_shape], 0)
@@ -514,29 +514,40 @@ def _SparseSoftmaxCrossEntropyWithLogitsGrad(op, grad_0, _):
 
 @ops.RegisterGradient("Conv2D")
 def _Conv2DGrad(op, grad):
+  """Gradient function for Conv2D."""
   dilations = op.get_attr("dilations")
   strides = op.get_attr("strides")
   padding = op.get_attr("padding")
+  explicit_paddings = op.get_attr("explicit_paddings")
   use_cudnn_on_gpu = op.get_attr("use_cudnn_on_gpu")
   data_format = op.get_attr("data_format")
   shape_0, shape_1 = array_ops.shape_n([op.inputs[0], op.inputs[1]])
+
+  # We call the gen_nn_ops backprop functions instead of nn_ops backprop
+  # functions for performance reasons in Eager mode. gen_nn_ops functions take a
+  # `explicit_paddings` parameter, but nn_ops functions do not. So if were were
+  # to use the nn_ops functions, we would have to convert `padding` and
+  # `explicit_paddings` into a single `padding` parameter, increasing overhead
+  # in Eager mode.
   return [
-      nn_ops.conv2d_backprop_input(
+      gen_nn_ops.conv2d_backprop_input(
           shape_0,
           op.inputs[1],
           grad,
           dilations=dilations,
           strides=strides,
           padding=padding,
+          explicit_paddings=explicit_paddings,
           use_cudnn_on_gpu=use_cudnn_on_gpu,
           data_format=data_format),
-      nn_ops.conv2d_backprop_filter(
+      gen_nn_ops.conv2d_backprop_filter(
           op.inputs[0],
           shape_1,
           grad,
           dilations=dilations,
           strides=strides,
           padding=padding,
+          explicit_paddings=explicit_paddings,
           use_cudnn_on_gpu=use_cudnn_on_gpu,
           data_format=data_format)
   ]
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 841bac8bea064f7dce8e9015745c89e7d98fc4d7..7abfde5149acfb3da6b27e03f5ddd95fec746db6 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -467,7 +467,7 @@ def depthwise_conv2d(input,
   to `channel_multiplier` channels for each), then concatenates the results
   together.  The output has `in_channels * channel_multiplier` channels.
 
-  In detail,
+  In detail, with the default NHWC format,
 
       output[b, i, j, k * channel_multiplier + q] = sum_{di, dj}
            filter[di, dj, k, q] * input[b, strides[1] * i + rate[0] * di,
@@ -540,7 +540,7 @@ def depthwise_conv2d_v2(input,
   to `channel_multiplier` channels for each), then concatenates the results
   together.  The output has `in_channels * channel_multiplier` channels.
 
-  In detail,
+  In detail, with the default NHWC format,
 
       output[b, i, j, k * channel_multiplier + q] = sum_{di, dj}
            filter[di, dj, k, q] * input[b, strides[1] * i + rate[0] * di,
@@ -599,7 +599,7 @@ def separable_conv2d(input,
   between dimensions `[1, 2]` and `3`, not spatial separability between
   dimensions `1` and `2`.
 
-  In detail,
+  In detail, with the default NHWC format,
 
       output[b, i, j, k] = sum_{di, dj, q, r}
           input[b, strides[1] * i + di, strides[2] * j + dj, q] *
@@ -699,7 +699,7 @@ def separable_conv2d_v2(
   between dimensions `[1, 2]` and `3`, not spatial separability between
   dimensions `1` and `2`.
 
-  In detail,
+  In detail, with the default NHWC format,
 
       output[b, i, j, k] = sum_{di, dj, q, r}
           input[b, strides[1] * i + di, strides[2] * j + dj, q] *
@@ -1380,6 +1380,8 @@ def _compute_sampled_logits(weights,
     # weights shape is [num_classes, dim]
     all_w = embedding_ops.embedding_lookup(
         weights, all_ids, partition_strategy=partition_strategy)
+    if all_w.dtype != inputs.dtype:
+      all_w = math_ops.cast(all_w, inputs.dtype)
 
     # true_w shape is [batch_size * num_true, dim]
     true_w = array_ops.slice(all_w, [0, 0],
@@ -1397,6 +1399,8 @@ def _compute_sampled_logits(weights,
     # add the biases to the true and sampled logits.
     all_b = embedding_ops.embedding_lookup(
         biases, all_ids, partition_strategy=partition_strategy)
+    if all_b.dtype != inputs.dtype:
+      all_b = math_ops.cast(all_b, inputs.dtype)
     # true_b is a [batch_size * num_true] tensor
     # sampled_b is a [num_sampled] float tensor
     true_b = array_ops.slice(all_b, [0], array_ops.shape(labels_flat))
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 6f2d2c15bd40109b79e7497c6b279fd8edf23bd7..f71fcef13051a8d05b65c648d775148a97470b27 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -171,7 +171,7 @@ class _NonAtrousConvolution(object):
         raise ValueError("data_format must be \"NHWC\" or \"NCHW\".")
       self.strides = strides
       self.data_format = data_format
-      self.conv_op = gen_nn_ops.conv2d
+      self.conv_op = conv2d
     elif conv_dims == 3:
       if data_format is None or data_format == "NDHWC":
         strides = [1] + list(strides) + [1]
@@ -1373,6 +1373,44 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
       name=name)
 
 
+def _convert_padding(padding):
+  """Converts Python padding to C++ padding for ops which take EXPLICIT padding.
+
+  Args:
+    padding: the `padding` argument for a Python op which supports EXPLICIT
+      padding.
+
+  Returns:
+    (padding, explicit_paddings) pair, which should be passed as attributes to a
+    C++ op.
+
+  Raises:
+    ValueError: If padding is invalid.
+  """
+  explicit_paddings = []
+  if padding == "EXPLICIT":
+    # Give a better error message if EXPLICIT is passed.
+    raise ValueError('"EXPLICIT" is not a valid value for the padding '
+                     "parameter. To use explicit padding, the padding "
+                     "parameter must be a list.")
+  if isinstance(padding, (list, tuple)):
+    for i, dim_paddings in enumerate(padding):
+      if not isinstance(dim_paddings, (list, tuple)):
+        raise ValueError("When padding is a list, each element of padding must "
+                         "be a list/tuple of size 2. Element with index %d of "
+                         "padding is not a list/tuple" % i)
+      if len(dim_paddings) != 2:
+        raise ValueError("When padding is a list, each element of padding must "
+                         "be a list/tuple of size 2. Element with index %d of "
+                         "padding has size %d" % (i, len(dim_paddings)))
+      explicit_paddings.extend(dim_paddings)
+    if len(padding) != 4:
+      raise ValueError("When padding is a list, it must be of size 4. Got "
+                       "padding of size: %d" % len(padding))
+    padding = "EXPLICIT"
+  return padding, explicit_paddings
+
+
 @tf_export("nn.conv2d", v1=[])
 def conv2d_v2(input,  # pylint: disable=redefined-builtin
               filters,
@@ -1418,8 +1456,13 @@ def conv2d_v2(input,  # pylint: disable=redefined-builtin
       1-D tensor of length 4.  The stride of the sliding window for each
       dimension of `input`. The dimension order is determined by the value of
       `data_format`, see below for details.
-    padding: A `string` from: `"SAME", "VALID"`.
-      The type of padding algorithm to use.
+    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
+      padding algorithm to use, or a list indicating the explicit paddings at
+      the start and end of each dimension. When explicit padding is used and
+      data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
+      pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit padding used
+      and data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
+      [pad_top, pad_bottom], [pad_left, pad_right]]`.
     data_format: An optional `string` from: `"NHWC", "NCHW"`.
       Defaults to `"NHWC"`.
       Specify the data format of the input and output data. With the
@@ -1441,15 +1484,98 @@ def conv2d_v2(input,  # pylint: disable=redefined-builtin
   # pylint: enable=line-too-long
   if dilations is None:
     dilations = [1, 1, 1, 1]
+  return conv2d(input,  # pylint: disable=redefined-builtin
+                filters,
+                strides,
+                padding,
+                use_cudnn_on_gpu=True,
+                data_format=data_format,
+                dilations=dilations,
+                name=name)
+
+
+@tf_export(v1=["nn.conv2d"])
+def conv2d(  # pylint: disable=redefined-builtin,dangerous-default-value
+    input,
+    filter,
+    strides,
+    padding,
+    use_cudnn_on_gpu=True,
+    data_format="NHWC",
+    dilations=[1, 1, 1, 1],
+    name=None):
+  r"""Computes a 2-D convolution given 4-D `input` and `filter` tensors.
+
+  Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+  and a filter / kernel tensor of shape
+  `[filter_height, filter_width, in_channels, out_channels]`, this op
+  performs the following:
+
+  1. Flattens the filter to a 2-D matrix with shape
+     `[filter_height * filter_width * in_channels, output_channels]`.
+  2. Extracts image patches from the input tensor to form a *virtual*
+     tensor of shape `[batch, out_height, out_width,
+     filter_height * filter_width * in_channels]`.
+  3. For each patch, right-multiplies the filter matrix and the image patch
+     vector.
+
+  In detail, with the default NHWC format,
+
+      output[b, i, j, k] =
+          sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q]
+                          * filter[di, dj, q, k]
+
+  Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+  horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+
+  Args:
+    input: A `Tensor`. Must be one of the following types:
+      `half`, `bfloat16`, `float32`, `float64`.
+      A 4-D tensor. The dimension order is interpreted according to the value
+      of `data_format`, see below for details.
+    filter: A `Tensor`. Must have the same type as `input`.
+      A 4-D tensor of shape
+      `[filter_height, filter_width, in_channels, out_channels]`
+    strides: A list of `ints`.
+      1-D tensor of length 4.  The stride of the sliding window for each
+      dimension of `input`. The dimension order is determined by the value of
+      `data_format`, see below for details.
+    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
+      padding algorithm to use, or a list indicating the explicit paddings at
+      the start and end of each dimension. When explicit padding is used and
+      data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
+      pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit padding used
+      and data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
+      [pad_top, pad_bottom], [pad_left, pad_right]]`.
+    use_cudnn_on_gpu: An optional `bool`. Defaults to `True`.
+    data_format: An optional `string` from: `"NHWC", "NCHW"`.
+      Defaults to `"NHWC"`.
+      Specify the data format of the input and output data. With the
+      default format "NHWC", the data is stored in the order of:
+          [batch, height, width, channels].
+      Alternatively, the format could be "NCHW", the data storage order of:
+          [batch, channels, height, width].
+    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
+      1-D tensor of length 4.  The dilation factor for each dimension of
+      `input`. If set to k > 1, there will be k-1 skipped cells between each
+      filter element on that dimension. The dimension order is determined by the
+      value of `data_format`, see above for details. Dilations in the batch and
+      depth dimensions must be 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `input`.
+  """
+  padding, explicit_paddings = _convert_padding(padding)
   return gen_nn_ops.conv2d(input,  # pylint: disable=redefined-builtin
-                           filters,
+                           filter,
                            strides,
                            padding,
-                           use_cudnn_on_gpu=True,
+                           use_cudnn_on_gpu=use_cudnn_on_gpu,
+                           explicit_paddings=explicit_paddings,
                            data_format=data_format,
                            dilations=dilations,
                            name=name)
-tf_export(v1=["nn.conv2d"])(gen_nn_ops.conv2d)
 
 
 @tf_export("nn.conv2d_backprop_filter", v1=[])
@@ -1478,8 +1604,13 @@ def conv2d_backprop_filter_v2(input,  # pylint: disable=redefined-builtin
       The stride of the sliding window for each dimension of the input
       of the convolution. Must be in the same order as the dimension specified
       with format.
-    padding: A `string` from: `"SAME", "VALID"`.
-      The type of padding algorithm to use.
+    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
+      padding algorithm to use, or a list indicating the explicit paddings at
+      the start and end of each dimension. When explicit padding is used and
+      data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
+      pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit padding used
+      and data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
+      [pad_top, pad_bottom], [pad_left, pad_right]]`.
     data_format: An optional `string` from: `"NHWC", "NCHW"`.
       Defaults to `"NHWC"`.
       Specify the data format of the input and output data. With the
@@ -1500,17 +1631,75 @@ def conv2d_backprop_filter_v2(input,  # pylint: disable=redefined-builtin
   """
   if dilations is None:
     dilations = [1, 1, 1, 1]
-  return gen_nn_ops.conv2d_backprop_filter(input,  # pylint: disable=redefined-builtin
-                                           filter_sizes,
-                                           out_backprop,
-                                           strides,
-                                           padding,
-                                           use_cudnn_on_gpu=True,
-                                           data_format=data_format,
-                                           dilations=dilations,
-                                           name=name)
-tf_export(v1=["nn.conv2d_backprop_filter"])(
-    gen_nn_ops.conv2d_backprop_filter)
+  return conv2d_backprop_filter(input,  # pylint: disable=redefined-builtin
+                                filter_sizes,
+                                out_backprop,
+                                strides,
+                                padding,
+                                use_cudnn_on_gpu=True,
+                                data_format=data_format,
+                                dilations=dilations,
+                                name=name)
+
+
+@tf_export(v1=["nn.conv2d_backprop_filter"])
+def conv2d_backprop_filter(  # pylint: disable=redefined-builtin,dangerous-default-value
+    input,
+    filter_sizes,
+    out_backprop,
+    strides,
+    padding,
+    use_cudnn_on_gpu=True,
+    data_format="NHWC",
+    dilations=[1, 1, 1, 1],
+    name=None):
+  r"""Computes the gradients of convolution with respect to the filter.
+
+  Args:
+    input: A `Tensor`. Must be one of the following types:
+      `half`, `bfloat16`, `float32`, `float64`.
+      4-D with shape `[batch, in_height, in_width, in_channels]`.
+    filter_sizes: A `Tensor` of type `int32`.
+      An integer vector representing the tensor shape of `filter`,
+      where `filter` is a 4-D
+      `[filter_height, filter_width, in_channels, out_channels]` tensor.
+    out_backprop: A `Tensor`. Must have the same type as `input`.
+      4-D with shape `[batch, out_height, out_width, out_channels]`.
+      Gradients w.r.t. the output of the convolution.
+    strides: A list of `ints`.
+      The stride of the sliding window for each dimension of the input
+      of the convolution. Must be in the same order as the dimension specified
+      with format.
+    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
+      padding algorithm to use, or a list indicating the explicit paddings at
+      the start and end of each dimension. When explicit padding is used and
+      data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
+      pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit padding used
+      and data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
+      [pad_top, pad_bottom], [pad_left, pad_right]]`.
+    use_cudnn_on_gpu: An optional `bool`. Defaults to `True`.
+    data_format: An optional `string` from: `"NHWC", "NCHW"`.
+      Defaults to `"NHWC"`.
+      Specify the data format of the input and output data. With the
+      default format "NHWC", the data is stored in the order of:
+          [batch, in_height, in_width, in_channels].
+      Alternatively, the format could be "NCHW", the data storage order of:
+          [batch, in_channels, in_height, in_width].
+    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
+      1-D tensor of length 4.  The dilation factor for each dimension of
+      `input`. If set to k > 1, there will be k-1 skipped cells between each
+      filter element on that dimension. The dimension order is determined by
+      the value of `data_format`, see above for details. Dilations in the batch
+      and depth dimensions must be 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `input`.
+  """
+  padding, explicit_paddings = _convert_padding(padding)
+  return gen_nn_ops.conv2d_backprop_filter(
+      input, filter_sizes, out_backprop, strides, padding, use_cudnn_on_gpu,
+      explicit_paddings, data_format, dilations, name)
 
 
 @tf_export("nn.conv2d_backprop_input", v1=[])
@@ -1539,8 +1728,13 @@ def conv2d_backprop_input_v2(input_sizes,
       The stride of the sliding window for each dimension of the input
       of the convolution. Must be in the same order as the dimension specified
       with format.
-    padding: A `string` from: `"SAME", "VALID"`.
-      The type of padding algorithm to use.
+    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
+      padding algorithm to use, or a list indicating the explicit paddings at
+      the start and end of each dimension. When explicit padding is used and
+      data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
+      pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit padding used
+      and data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
+      [pad_top, pad_bottom], [pad_left, pad_right]]`.
     data_format: An optional `string` from: `"NHWC", "NCHW"`.
       Defaults to `"NHWC"`.
       Specify the data format of the input and output data. With the
@@ -1561,17 +1755,75 @@ def conv2d_backprop_input_v2(input_sizes,
   """
   if dilations is None:
     dilations = [1, 1, 1, 1]
-  return gen_nn_ops.conv2d_backprop_input(input_sizes,
-                                          filters,
-                                          out_backprop,
-                                          strides,
-                                          padding,
-                                          use_cudnn_on_gpu=True,
-                                          data_format=data_format,
-                                          dilations=dilations,
-                                          name=name)
-tf_export(v1=["nn.conv2d_backprop_input"])(
-    gen_nn_ops.conv2d_backprop_input)
+  return conv2d_backprop_input(input_sizes,
+                               filters,
+                               out_backprop,
+                               strides,
+                               padding,
+                               use_cudnn_on_gpu=True,
+                               data_format=data_format,
+                               dilations=dilations,
+                               name=name)
+
+
+@tf_export(v1=["nn.conv2d_backprop_input"])
+def conv2d_backprop_input(  # pylint: disable=redefined-builtin,dangerous-default-value
+    input_sizes,
+    filter,
+    out_backprop,
+    strides,
+    padding,
+    use_cudnn_on_gpu=True,
+    data_format="NHWC",
+    dilations=[1, 1, 1, 1],
+    name=None):
+  r"""Computes the gradients of convolution with respect to the input.
+
+  Args:
+    input_sizes: A `Tensor` of type `int32`.
+      An integer vector representing the shape of `input`,
+      where `input` is a 4-D `[batch, height, width, channels]` tensor.
+    filter: A `Tensor`. Must be one of the following types:
+      `half`, `bfloat16`, `float32`, `float64`.
+      4-D with shape
+      `[filter_height, filter_width, in_channels, out_channels]`.
+    out_backprop: A `Tensor`. Must have the same type as `filter`.
+      4-D with shape `[batch, out_height, out_width, out_channels]`.
+      Gradients w.r.t. the output of the convolution.
+    strides: A list of `ints`.
+      The stride of the sliding window for each dimension of the input
+      of the convolution. Must be in the same order as the dimension specified
+      with format.
+    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
+      padding algorithm to use, or a list indicating the explicit paddings at
+      the start and end of each dimension. When explicit padding is used and
+      data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
+      pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit padding used
+      and data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
+      [pad_top, pad_bottom], [pad_left, pad_right]]`.
+    use_cudnn_on_gpu: An optional `bool`. Defaults to `True`.
+    data_format: An optional `string` from: `"NHWC", "NCHW"`.
+      Defaults to `"NHWC"`.
+      Specify the data format of the input and output data. With the
+      default format "NHWC", the data is stored in the order of:
+          [batch, in_height, in_width, in_channels].
+      Alternatively, the format could be "NCHW", the data storage order of:
+          [batch, in_channels, in_height, in_width].
+    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
+      1-D tensor of length 4.  The dilation factor for each dimension of
+      `input`. If set to k > 1, there will be k-1 skipped cells between each
+      filter element on that dimension. The dimension order is determined by
+      the value of `data_format`, see above for details. Dilations in the batch
+      and depth dimensions must be 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `filter`.
+  """
+  padding, explicit_paddings = _convert_padding(padding)
+  return gen_nn_ops.conv2d_backprop_input(
+      input_sizes, filter, out_backprop, strides, padding, use_cudnn_on_gpu,
+      explicit_paddings, data_format, dilations, name)
 
 
 @tf_export(v1=["nn.conv2d_transpose"])
diff --git a/tensorflow/python/ops/parallel_for/BUILD b/tensorflow/python/ops/parallel_for/BUILD
index 07fc9433a2582225a8da687eb8c9563c8fcac9e2..1028963b1ae3d1f3ec074c343d2b2ca6553d4376 100644
--- a/tensorflow/python/ops/parallel_for/BUILD
+++ b/tensorflow/python/ops/parallel_for/BUILD
@@ -15,11 +15,13 @@ py_library(
         "control_flow_ops.py",
         "gradients.py",
         "pfor.py",
+        "test_util.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
         ":control_flow_ops",
         ":gradients",
+        ":test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:constant_op",
@@ -83,12 +85,25 @@ py_library(
     ],
 )
 
+py_library(
+    name = "test_util",
+    srcs = ["test_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pfor_lib",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variables",
+    ],
+)
+
 cuda_py_test(
     name = "control_flow_ops_test",
-    size = "large",
     srcs = ["control_flow_ops_test.py"],
     additional_deps = [
         ":control_flow_ops",
+        ":test_util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:gradients",
@@ -101,6 +116,34 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "array_test",
+    srcs = ["array_test.py"],
+    additional_deps = [
+        ":control_flow_ops",
+        ":test_util",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python/eager:backprop",
+    ],
+)
+
+cuda_py_test(
+    name = "math_test",
+    srcs = ["math_test.py"],
+    additional_deps = [
+        ":control_flow_ops",
+        ":test_util",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:util",
+    ],
+    tags = ["optonly"],  # Too slow in non-opt mode
+)
+
 py_library(
     name = "gradients",
     srcs = ["gradients.py"],
@@ -115,7 +158,6 @@ py_library(
 
 cuda_py_test(
     name = "gradients_test",
-    size = "large",
     srcs = ["gradients_test.py"],
     additional_deps = [
         ":control_flow_ops",
@@ -128,4 +170,5 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python/ops/losses",
     ],
+    tags = ["optonly"],  # Too slow in non-opt mode
 )
diff --git a/tensorflow/python/ops/parallel_for/array_test.py b/tensorflow/python/ops/parallel_for/array_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f0c0f5b992b3f005dc8b75a6d0207237a5205bb
--- /dev/null
+++ b/tensorflow/python/ops/parallel_for/array_test.py
@@ -0,0 +1,274 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for vectorization of array kernels."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
+from tensorflow.python.ops.parallel_for import control_flow_ops as pfor_control_flow_ops
+from tensorflow.python.ops.parallel_for.test_util import PForTestCase
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class ArrayTest(PForTestCase):
+
+  def test_gather(self):
+    x = random_ops.random_uniform([3, 3, 3])
+
+    def loop_fn(i):
+      outputs = []
+      x_i = array_ops.gather(x, i)
+      for y in [x, x_i]:
+        axes = [0, 2, -1] if y == x else [0]
+        for axis in axes:
+          outputs.append(array_ops.gather(y, 2, axis=axis))
+          outputs.append(array_ops.gather(y, i, axis=axis))
+          outputs.append(array_ops.gather(y, [i], axis=axis))
+          outputs.append(array_ops.gather(y, [i, 2], axis=axis))
+          outputs.append(array_ops.gather(y, [[2, i], [i, 1]], axis=axis))
+      return outputs
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 20)
+
+  def test_shape(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x_i = array_ops.gather(x, i)
+      return array_ops.shape(x_i), array_ops.shape(x_i, out_type=dtypes.int64)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32, dtypes.int64])
+
+  def test_size(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x_i = array_ops.gather(x, i)
+      return array_ops.size(x_i), array_ops.size(x_i, out_type=dtypes.int64)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32, dtypes.int64])
+
+  def test_rank(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x_i = array_ops.gather(x, i)
+      return array_ops.rank(x_i)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
+
+  def test_shape_n(self):
+    x = random_ops.random_uniform([3, 2, 3])
+    y = random_ops.random_uniform([3])
+
+    def loop_fn(i):
+      x_i = array_ops.gather(x, i)
+      y_i = array_ops.gather(y, i)
+      return array_ops.shape_n([x_i, x, y, y_i]), array_ops.shape_n(
+          [x_i, x, y, y_i], out_type=dtypes.int64)
+
+    self._test_loop_fn(
+        loop_fn, 3, loop_fn_dtypes=[dtypes.int32] * 4 + [dtypes.int64] * 4)
+
+  def test_reshape(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.reshape(x1, [-1]), array_ops.reshape(x1, [1, 3, 1, -1])
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+  def test_expand_dims(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.expand_dims(
+          x1, axis=-1), array_ops.expand_dims(
+              x1, axis=1)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+  def test_slice(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.slice(x1, begin=(0, 1), size=(2, 1))
+
+    self._test_loop_fn(loop_fn, 3)
+
+  def test_tile(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.tile(x1, [2, 1])
+
+    self._test_loop_fn(loop_fn, 3)
+
+  def test_tile_loop_dependent(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.tile(x1, [i, 1])
+
+    with self.assertRaisesRegexp(ValueError, "expected to be loop invariant"):
+      pfor_control_flow_ops.pfor(loop_fn, 2)
+
+  def test_pack(self):
+    x = random_ops.random_uniform([3, 2, 3])
+    y = random_ops.random_uniform([2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.stack([x1, y], axis=-1)
+
+    self._test_loop_fn(loop_fn, 1)
+
+  def test_unpack(self):
+    x = random_ops.random_uniform([3, 2, 3, 4])
+
+    def loop_fn(i):
+      x_i = array_ops.gather(x, i)
+      return array_ops.unstack(
+          x_i, 4, axis=-1), array_ops.unstack(
+              x_i, 3, axis=1)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 7)
+
+  def test_pad(self):
+    x = random_ops.random_uniform([3, 2, 3])
+    padding = constant_op.constant([[1, 2], [3, 4]])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.pad(x1, padding, mode="CONSTANT")
+
+    self._test_loop_fn(loop_fn, 3)
+
+  def test_split(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.split(x1, 2, axis=0), array_ops.split(x1, 3, axis=-1)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 5)
+
+  def test_split_v(self):
+    x = random_ops.random_uniform([3, 6, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return (array_ops.split(x1, [2, 1, 3], axis=0),
+              array_ops.split(x1, [3], axis=-1))
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 4)
+
+  def test_transpose(self):
+    x = random_ops.random_uniform([3, 2, 3, 4])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.transpose(x1, [2, 1, 0])
+
+    self._test_loop_fn(loop_fn, 3)
+
+  def test_zeros_like(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      z = array_ops.zeros_like(x1),
+      return z, z + x1
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+  def test_concat_v2(self):
+    x = random_ops.random_uniform([3, 2, 3])
+    y = random_ops.random_uniform([2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.concat(
+          [x1, x1, y], axis=0), array_ops.concat(
+              [x1, x1, y], axis=-1)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+  def test_unary_cwise_ops(self):
+    for op in [array_ops.identity, array_ops.stop_gradient]:
+      with backprop.GradientTape(persistent=True) as g:
+        x = random_ops.random_uniform([3, 5])
+        g.watch(x)
+
+      # pylint: disable=cell-var-from-loop
+      def loop_fn(i):
+        with g:
+          x1 = array_ops.gather(x, i)
+          y = op(x1) + x1
+          loss = nn.l2_loss(y)
+        return op(x), y, g.gradient(loss, x1)
+
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 3)
+
+  def test_identity_n(self):
+    x = random_ops.random_uniform([3, 4])
+
+    def loop_fn(i):
+      return array_ops.identity_n([x, array_ops.gather(x, i)])
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+  def test_matrix_diag_part(self):
+    x = random_ops.random_uniform([3, 4, 2])
+
+    def loop_fn(i):
+      return array_ops.matrix_diag_part(array_ops.gather(x, i))
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32])
+
+  def test_strided_slice(self):
+    with backprop.GradientTape(persistent=True) as g:
+      x = random_ops.random_uniform([3, 3, 4, 4, 2, 2, 2])
+      g.watch(x)
+
+    def loop_fn(i):
+      with g:
+        x_i = array_ops.gather(x, i)
+        y = x_i[:2, ::2, 1::3, ..., array_ops.newaxis, 1]
+        loss = nn.l2_loss(y)
+      return y, g.gradient(loss, x_i)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
index 8acb0d839c59ce87652d2029deb014958fd013cc..1a8f639d43eebde51f48a6da5615fcda9e5f7cf0 100644
--- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
+++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
@@ -27,7 +27,6 @@ from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
 from tensorflow.python.client import session
 from tensorflow.python.eager import backprop
-from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -35,7 +34,6 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import bitwise_ops
-from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import functional_ops
@@ -51,40 +49,13 @@ from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-im
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.parallel_for import control_flow_ops as pfor_control_flow_ops
+from tensorflow.python.ops.parallel_for.test_util import PForTestCase
 from tensorflow.python.platform import test
 from tensorflow.python.util import nest
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class PForTest(test.TestCase):
-
-  def _run_targets(self, targets1, targets2=None, run_init=True):
-    targets1 = nest.flatten(targets1)
-    targets2 = ([] if targets2 is None else nest.flatten(targets2))
-    assert len(targets1) == len(targets2) or not targets2
-    if run_init:
-      init = variables.global_variables_initializer()
-      self.evaluate(init)
-    return self.evaluate(targets1 + targets2)
-
-  def run_and_assert_equal(self, targets1, targets2):
-    outputs = self._run_targets(targets1, targets2)
-    outputs = nest.flatten(outputs)  # flatten SparseTensorValues
-    n = len(outputs) // 2
-    for i in range(n):
-      if outputs[i + n].dtype != np.object:
-        self.assertAllClose(outputs[i + n], outputs[i], rtol=1e-4, atol=1e-5)
-      else:
-        self.assertAllEqual(outputs[i + n], outputs[i])
-
-  def _test_loop_fn(self, loop_fn, iters,
-                    loop_fn_dtypes=dtypes.float32,
-                    parallel_iterations=None):
-    t1 = pfor_control_flow_ops.pfor(loop_fn, iters=iters,
-                                    parallel_iterations=parallel_iterations)
-    t2 = pfor_control_flow_ops.for_loop(loop_fn, loop_fn_dtypes, iters=iters,
-                                        parallel_iterations=parallel_iterations)
-    self.run_and_assert_equal(t1, t2)
+class PForTest(PForTestCase):
 
   def test_op_conversion_fallback_to_while_loop(self):
     # Note that we used top_k op for this test. If a converter gets defined for
@@ -105,10 +76,6 @@ class PForTest(test.TestCase):
     flags.FLAGS.op_conversion_fallback_to_while_loop = False
 
   def test_parallel_iterations(self):
-    # TODO(b/121334512): Remove this check once this passes in Eager mode.
-    if context.executing_eagerly():
-      return
-
     for parallel_iterations in [2, 3, 8, 10]:
       x = random_ops.random_uniform([8, 3])
 
@@ -134,246 +101,7 @@ class PForTest(test.TestCase):
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class ArrayTest(PForTest):
-
-  def test_gather(self):
-    x = random_ops.random_uniform([3, 3, 3])
-
-    def loop_fn(i):
-      outputs = []
-      x_i = array_ops.gather(x, i)
-      for y in [x, x_i]:
-        axes = [0, 2, -1] if y == x else [0]
-        for axis in axes:
-          outputs.append(array_ops.gather(y, 2, axis=axis))
-          outputs.append(array_ops.gather(y, i, axis=axis))
-          outputs.append(array_ops.gather(y, [i], axis=axis))
-          outputs.append(array_ops.gather(y, [i, 2], axis=axis))
-          outputs.append(array_ops.gather(y, [[2, i], [i, 1]], axis=axis))
-      return outputs
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 20)
-
-  def test_shape(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x_i = array_ops.gather(x, i)
-      return array_ops.shape(x_i), array_ops.shape(x_i, out_type=dtypes.int64)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32, dtypes.int64])
-
-  def test_size(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x_i = array_ops.gather(x, i)
-      return array_ops.size(x_i), array_ops.size(x_i, out_type=dtypes.int64)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32, dtypes.int64])
-
-  def test_rank(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x_i = array_ops.gather(x, i)
-      return array_ops.rank(x_i)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
-
-  def test_shape_n(self):
-    x = random_ops.random_uniform([3, 2, 3])
-    y = random_ops.random_uniform([3])
-
-    def loop_fn(i):
-      x_i = array_ops.gather(x, i)
-      y_i = array_ops.gather(y, i)
-      return array_ops.shape_n([x_i, x, y, y_i]), array_ops.shape_n(
-          [x_i, x, y, y_i], out_type=dtypes.int64)
-
-    self._test_loop_fn(
-        loop_fn, 3, loop_fn_dtypes=[dtypes.int32] * 4 + [dtypes.int64] * 4)
-
-  def test_reshape(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.reshape(x1, [-1]), array_ops.reshape(x1, [1, 3, 1, -1])
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
-
-  def test_expand_dims(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.expand_dims(
-          x1, axis=-1), array_ops.expand_dims(
-              x1, axis=1)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
-
-  def test_slice(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.slice(x1, begin=(0, 1), size=(2, 1))
-
-    self._test_loop_fn(loop_fn, 3)
-
-  def test_tile(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.tile(x1, [2, 1])
-
-    self._test_loop_fn(loop_fn, 3)
-
-  def test_tile_loop_dependent(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.tile(x1, [i, 1])
-
-    with self.assertRaisesRegexp(ValueError, "expected to be loop invariant"):
-      pfor_control_flow_ops.pfor(loop_fn, 2)
-
-  def test_pack(self):
-    x = random_ops.random_uniform([3, 2, 3])
-    y = random_ops.random_uniform([2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.stack([x1, y], axis=-1)
-
-    self._test_loop_fn(loop_fn, 1)
-
-  def test_unpack(self):
-    x = random_ops.random_uniform([3, 2, 3, 4])
-
-    def loop_fn(i):
-      x_i = array_ops.gather(x, i)
-      return array_ops.unstack(
-          x_i, 4, axis=-1), array_ops.unstack(
-              x_i, 3, axis=1)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 7)
-
-  def test_pad(self):
-    x = random_ops.random_uniform([3, 2, 3])
-    padding = constant_op.constant([[1, 2], [3, 4]])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.pad(x1, padding, mode="CONSTANT")
-
-    self._test_loop_fn(loop_fn, 3)
-
-  def test_split(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.split(x1, 2, axis=0), array_ops.split(x1, 3, axis=-1)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 5)
-
-  def test_split_v(self):
-    x = random_ops.random_uniform([3, 6, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return (array_ops.split(x1, [2, 1, 3], axis=0),
-              array_ops.split(x1, [3], axis=-1))
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 4)
-
-  def test_transpose(self):
-    x = random_ops.random_uniform([3, 2, 3, 4])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.transpose(x1, [2, 1, 0])
-
-    self._test_loop_fn(loop_fn, 3)
-
-  def test_zeros_like(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      z = array_ops.zeros_like(x1),
-      return z, z + x1
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
-
-  def test_concat_v2(self):
-    x = random_ops.random_uniform([3, 2, 3])
-    y = random_ops.random_uniform([2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.concat(
-          [x1, x1, y], axis=0), array_ops.concat(
-              [x1, x1, y], axis=-1)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
-
-  def test_unary_cwise_ops(self):
-    for op in [array_ops.identity, array_ops.stop_gradient]:
-      with backprop.GradientTape(persistent=True) as g:
-        x = random_ops.random_uniform([3, 5])
-        g.watch(x)
-
-      # pylint: disable=cell-var-from-loop
-      def loop_fn(i):
-        with g:
-          x1 = array_ops.gather(x, i)
-          y = op(x1) + x1
-          loss = nn.l2_loss(y)
-        return op(x), y, g.gradient(loss, x1)
-
-      # pylint: enable=cell-var-from-loop
-
-      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 3)
-
-  def test_identity_n(self):
-    x = random_ops.random_uniform([3, 4])
-
-    def loop_fn(i):
-      return array_ops.identity_n([x, array_ops.gather(x, i)])
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
-
-  def test_matrix_diag_part(self):
-    x = random_ops.random_uniform([3, 4, 2])
-
-    def loop_fn(i):
-      return array_ops.matrix_diag_part(array_ops.gather(x, i))
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32])
-
-  def test_strided_slice(self):
-    with backprop.GradientTape(persistent=True) as g:
-      x = random_ops.random_uniform([3, 3, 4, 4, 2, 2, 2])
-      g.watch(x)
-
-    def loop_fn(i):
-      with g:
-        x_i = array_ops.gather(x, i)
-        y = x_i[:2, ::2, 1::3, ..., array_ops.newaxis, 1]
-        loss = nn.l2_loss(y)
-      return y, g.gradient(loss, x_i)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
-
-
-@test_util.run_all_in_graph_and_eager_modes
-class BitwiseTest(PForTest):
+class BitwiseTest(PForTestCase):
 
   def test_unary_cwise(self):
     for op in [bitwise_ops.invert]:
@@ -413,376 +141,7 @@ class BitwiseTest(PForTest):
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class MathTest(PForTest):
-
-  def test_unary_cwise_ops(self):
-    complex_ops = [
-        math_ops.angle,
-        math_ops.imag,
-        math_ops.complex_abs,
-        math_ops.real,
-        math_ops.conj,
-    ]
-    real_ops = [
-        lambda x: math_ops.acosh(1 + math_ops.square(x)),
-        math_ops.abs,
-        math_ops.acos,
-        math_ops.asin,
-        math_ops.asinh,
-        math_ops.atan,
-        math_ops.atanh,
-        math_ops.bessel_i0e,
-        math_ops.bessel_i1e,
-        math_ops.cos,
-        math_ops.cosh,
-        math_ops.digamma,
-        math_ops.erf,
-        math_ops.erfc,
-        math_ops.exp,
-        math_ops.expm1,
-        math_ops.inv,
-        math_ops.is_finite,
-        math_ops.is_inf,
-        math_ops.lgamma,
-        math_ops.log,
-        math_ops.log1p,
-        math_ops.neg,
-        math_ops.negative,
-        math_ops.reciprocal,
-        math_ops.rint,
-        math_ops.round,
-        math_ops.rsqrt,
-        math_ops.sigmoid,
-        math_ops.sign,
-        math_ops.sin,
-        math_ops.sinh,
-        math_ops.sqrt,
-        math_ops.square,
-        math_ops.tan,
-        math_ops.tanh,
-        math_ops.tanh,
-        nn.elu,
-        nn.relu,
-        nn.relu6,
-        nn.selu,
-        nn.softplus,
-        nn.softsign,
-    ]
-    for op in complex_ops + real_ops:
-      with backprop.GradientTape(persistent=True) as g:
-        x = random_ops.random_uniform([3, 5])
-        g.watch(x)
-        if op in complex_ops:
-          y = random_ops.random_uniform([3, 5])
-          g.watch(y)
-          x = math_ops.complex(x, y)
-
-      # pylint: disable=cell-var-from-loop
-      output_dtypes = []
-      def loop_fn(i):
-        with g:
-          x1 = array_ops.gather(x, i)
-          y1 = op(x1)
-          outputs = [op(x), y1]
-          if y1.dtype == dtypes.float32:
-            loss = math_ops.reduce_sum(y1 * y1)
-          else:
-            loss = None
-        if loss is not None:
-          grad = g.gradient(loss, x1)
-          if grad is not None:
-            outputs.append(grad)
-        del output_dtypes[:]
-        output_dtypes.extend([t.dtype for t in outputs])
-        return outputs
-
-      # pylint: enable=cell-var-from-loop
-
-      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=output_dtypes)
-
-  def test_unary_cwise_no_grad(self):
-    for op in [math_ops.ceil,
-               math_ops.floor,
-               math_ops.logical_not]:
-      x = random_ops.random_uniform([3, 5])
-      if op == math_ops.logical_not:
-        x = x > 0
-
-      # pylint: disable=cell-var-from-loop
-      def loop_fn(i):
-        return op(array_ops.gather(x, i))
-
-      # pylint: enable=cell-var-from-loop
-
-      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=x.dtype)
-
-  def test_binary_cwise_ops(self):
-    logical_ops = [
-        math_ops.logical_and,
-        math_ops.logical_or,
-        math_ops.logical_xor
-    ]
-
-    # Wrapper functions restricting the range of inputs of zeta and polygamma.
-    def safe_polygamma(x, y):
-      return math_ops.polygamma(
-          math_ops.round(clip_ops.clip_by_value(y, 1, 10)),
-          x * x + 1)
-
-    def safe_zeta(x, y):
-      return math_ops.zeta(x * x + 1, y * y)
-
-    float_ops = [
-        math_ops.add,
-        math_ops.add_v2,
-        math_ops.atan2,
-        math_ops.complex,
-        math_ops.div,
-        math_ops.divide,
-        math_ops.div_no_nan,
-        math_ops.equal,
-        math_ops.floor_div,
-        math_ops.floor_mod,
-        math_ops.greater,
-        math_ops.greater_equal,
-        math_ops.igamma,
-        math_ops.igammac,
-        math_ops.igamma_grad_a,
-        math_ops.less,
-        math_ops.less_equal,
-        math_ops.maximum,
-        math_ops.minimum,
-        math_ops.mod,
-        math_ops.multiply,
-        math_ops.not_equal,
-        math_ops.pow,
-        math_ops.squared_difference,
-        math_ops.subtract,
-        math_ops.truncate_mod,
-        safe_polygamma,
-        safe_zeta,
-    ]
-    for op in logical_ops + float_ops:
-      x = random_ops.random_uniform([7, 3, 5])
-      y = random_ops.random_uniform([3, 5])
-      if op in logical_ops:
-        x = x > 0
-        y = y > 0
-
-      output_dtypes = []
-      # pylint: disable=cell-var-from-loop
-      def loop_fn(i):
-        x1 = array_ops.gather(x, i)
-        y1 = array_ops.gather(y, i)
-        outputs = [op(x, y), op(x1, y), op(x, y1), op(x1, y1), op(x1, x1)]
-        del output_dtypes[:]
-        output_dtypes.extend([t.dtype for t in outputs])
-        return outputs
-      # pylint: enable=cell-var-from-loop
-
-      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=output_dtypes)
-
-  def test_approximate_equal(self):
-    x = random_ops.random_uniform([3, 5])
-    y = random_ops.random_uniform([3, 5])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      y1 = array_ops.gather(y, i)
-      return math_ops.approximate_equal(x1, y1)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.bool])
-
-  def test_addn(self):
-    x = random_ops.random_uniform([2, 3, 5])
-    y = random_ops.random_uniform([3, 5])
-    z = random_ops.random_uniform([3, 5])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return math_ops.add_n([x1, y, z])
-
-    self._test_loop_fn(loop_fn, 2)
-
-  def test_matmul(self):
-    for tr_a in (True, False):
-      for tr_b in (True, False):
-        for stack_a in (True, False):
-          for stack_b in (True, False):
-            shape_a = (5, 3) if tr_a else (3, 5)
-            if stack_a:
-              shape_a = (2,) + shape_a
-            shape_b = (7, 5) if tr_b else (5, 7)
-            if stack_b:
-              shape_b = (2,) + shape_b
-
-            x = random_ops.random_uniform(shape_a)
-            y = random_ops.random_uniform(shape_b)
-
-            # pylint: disable=cell-var-from-loop
-            def loop_fn(i):
-              a = array_ops.gather(x, i) if stack_a else x
-              b = array_ops.gather(y, i) if stack_b else y
-              return math_ops.matmul(a, b, transpose_a=tr_a, transpose_b=tr_b)
-
-            # pylint: enable=cell-var-from-loop
-
-            self._test_loop_fn(loop_fn, 2)
-
-  def test_batch_matmul(self):
-    for tr_a in (True, False):
-      for tr_b in (True, False):
-        for stack_a in (True, False):
-          for stack_b in (True, False):
-            shape_a = (4, 5, 3) if tr_a else (4, 3, 5)
-            if stack_a:
-              shape_a = (2,) + shape_a
-            shape_b = (4, 7, 5) if tr_b else (4, 5, 7)
-            if stack_b:
-              shape_b = (2,) + shape_b
-
-            x = random_ops.random_uniform(shape_a)
-            y = random_ops.random_uniform(shape_b)
-
-            # pylint: disable=cell-var-from-loop
-            def loop_fn(i):
-              a = array_ops.gather(x, i) if stack_a else x
-              b = array_ops.gather(y, i) if stack_b else y
-              return math_ops.matmul(a, b, transpose_a=tr_a, transpose_b=tr_b)
-
-            # pylint: enable=cell-var-from-loop
-
-            self._test_loop_fn(loop_fn, 2)
-
-  def test_reduction(self):
-    x = random_ops.random_uniform([2, 3, 4, 5])
-    for op in [
-        math_ops.reduce_sum, math_ops.reduce_prod, math_ops.reduce_max,
-        math_ops.reduce_min
-    ]:
-      for axis in ([1], None, [0, 2]):
-        for keepdims in (True, False):
-
-          # pylint: disable=cell-var-from-loop
-          def loop_fn(i):
-            a = array_ops.gather(x, i)
-            return op(a, axis=axis, keepdims=keepdims)
-
-          # pylint: enable=cell-var-from-loop
-
-          self._test_loop_fn(loop_fn, 2)
-
-  def test_cum_sum(self):
-    x = random_ops.random_uniform([2, 3, 4, 5])
-    for axis in (1, -2):
-      for exclusive in (True, False):
-        for reverse in (True, False):
-
-          # pylint: disable=cell-var-from-loop
-          def loop_fn(i):
-            a = array_ops.gather(x, i)
-            return math_ops.cumsum(
-                a, axis=axis, exclusive=exclusive, reverse=reverse)
-
-          # pylint: enable=cell-var-from-loop
-
-          self._test_loop_fn(loop_fn, 2)
-
-  def test_cum_prod(self):
-    x = random_ops.random_uniform([2, 3, 4, 5])
-    for axis in (1, -2):
-      for exclusive in (True, False):
-        for reverse in (True, False):
-
-          # pylint: disable=cell-var-from-loop
-          def loop_fn(i):
-            a = array_ops.gather(x, i)
-            return math_ops.cumprod(
-                a, axis=axis, exclusive=exclusive, reverse=reverse)
-
-          # pylint: enable=cell-var-from-loop
-
-          self._test_loop_fn(loop_fn, 2)
-
-  def test_bias_add(self):
-    x_shape = [2, 3, 4, 5, 6]
-    x = random_ops.random_uniform(x_shape)
-    for data_format in ("NCHW", "NHWC"):
-      with backprop.GradientTape(persistent=True) as g:
-        bias_dim = 2 if data_format == "NCHW" else -1
-        bias_shape = x_shape[bias_dim]
-        bias = random_ops.random_uniform([bias_shape])
-        g.watch(bias)
-
-      # pylint: disable=cell-var-from-loop
-      def loop_fn(i):
-        with g:
-          a = array_ops.gather(x, i)
-          y = nn.bias_add(a, bias, data_format=data_format)
-          loss = math_ops.reduce_sum(y * y)
-        return y, g.gradient(loss, bias)
-      # pylint: enable=cell-var-from-loop
-
-      self._test_loop_fn(
-          loop_fn, 2, loop_fn_dtypes=[dtypes.float32, dtypes.float32])
-
-  def test_unsorted_segment_sum(self):
-    t = random_ops.random_uniform([3, 3, 2])
-    segment_ids = constant_op.constant([[0, 0, 2], [0, 1, 2], [2, 2, 2]])
-    num_segments = 3
-
-    def loop_fn(i):
-      data = array_ops.gather(t, i)
-      data_0 = array_ops.gather(t, 0)
-      seg_ids = array_ops.gather(segment_ids, i)
-      return (math_ops.unsorted_segment_sum(data, seg_ids, num_segments),
-              math_ops.unsorted_segment_sum(data_0, seg_ids, num_segments))
-
-    self._test_loop_fn(loop_fn, 3, [dtypes.float32] * 2)
-
-  def test_cast(self):
-    x = constant_op.constant([[1], [2]])
-    y = constant_op.constant([[1.0], [2.0]])
-
-    def loop_fn(i):
-      return (math_ops.cast(array_ops.gather(x, i), dtypes.float32),
-              math_ops.cast(array_ops.gather(y, i), dtypes.int32))
-
-    self._test_loop_fn(
-        loop_fn, 2, loop_fn_dtypes=[dtypes.float32, dtypes.int32])
-
-  def test_tanh_axpy(self):
-    a = constant_op.constant(3.)
-    x = random_ops.random_uniform([4, 5])
-    y = random_ops.random_uniform([6, 5])
-    n = x.shape[0]
-
-    def loop_fn(i):
-      return math_ops.tanh(a * array_ops.gather(x, i) + array_ops.gather(y, i))
-
-    self._test_loop_fn(loop_fn, n)
-
-  def test_select(self):
-    cond = constant_op.constant([True, False])
-    a = random_ops.random_uniform([2, 3, 5])
-    b = random_ops.random_uniform([2, 3, 5])
-    for cond_shape in [2], [2, 3], [2, 3, 5]:
-      cond = random_ops.random_uniform(cond_shape) > 0.5
-
-      # pylint: disable=cell-var-from-loop
-      def loop_fn(i):
-        a_i = array_ops.gather(a, i)
-        b_i = array_ops.gather(b, i)
-        cond_i = array_ops.gather(cond, i)
-        return array_ops.where(cond_i, a_i, b_i)
-
-      # pylint: enable=cell-var-from-loop
-
-      self._test_loop_fn(loop_fn, 2)
-
-
-@test_util.run_all_in_graph_and_eager_modes
-class NNTest(PForTest):
+class NNTest(PForTestCase):
 
   def test_conv2d(self):
     x = random_ops.random_uniform([3, 2, 12, 12, 3])
@@ -961,7 +320,7 @@ class NNTest(PForTest):
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
 
 
-class RandomTest(PForTest):
+class RandomTest(PForTestCase):
 
   # The random values generated in the two implementations are not guaranteed to
   # match. So we only check the returned shapes.
@@ -1014,8 +373,9 @@ class RandomTest(PForTest):
     self._test_loop_fn(loop_fn, 5)
 
 
-class LoggingTest(PForTest):
+class LoggingTest(PForTestCase):
 
+  @test_util.run_v1_only("b/122612051")
   def test_print(self):
     x = random_ops.random_uniform([3, 5])
 
@@ -1036,8 +396,9 @@ class LoggingTest(PForTest):
       sess.run(pfor_control_flow_ops.pfor(loop_fn, 3))
 
 
-class TensorArrayTest(PForTest):
+class TensorArrayTest(PForTestCase):
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_outside_and_read(self):
 
     ta = tensor_array_ops.TensorArray(
@@ -1048,6 +409,7 @@ class TensorArrayTest(PForTest):
 
     self._test_loop_fn(loop_fn, 2, [dtypes.int32] * 2)
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_outside_and_gather(self):
 
     ta = tensor_array_ops.TensorArray(
@@ -1058,6 +420,7 @@ class TensorArrayTest(PForTest):
 
     self._test_loop_fn(loop_fn, 2, [dtypes.int32] * 2)
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_outside_and_write_and_scatter(self):
 
     t = tensor_array_ops.TensorArray(dtypes.int32, 10, clear_after_read=False)
@@ -1079,6 +442,7 @@ class TensorArrayTest(PForTest):
     output2 = self._run_targets(out2)
     self.assertAllClose(output2, output1)
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_inside_and_write(self):
 
     def loop_fn(i):
@@ -1090,6 +454,7 @@ class TensorArrayTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, [dtypes.int32] * 2)
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_inside_and_scatter(self):
 
     def loop_fn(i):
@@ -1102,6 +467,7 @@ class TensorArrayTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, [dtypes.int32] * 2)
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_inside_and_read(self):
 
     def loop_fn(i):
@@ -1114,6 +480,7 @@ class TensorArrayTest(PForTest):
 
     self._test_loop_fn(loop_fn, 2, [dtypes.int32] * 3)
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_inside_and_gather(self):
 
     def loop_fn(i):
@@ -1126,6 +493,7 @@ class TensorArrayTest(PForTest):
 
     self._test_loop_fn(loop_fn, 2, [dtypes.int32] * 3)
 
+  @test_util.run_v1_only("b/122612051")
   def test_grad(self):
     x = random_ops.random_uniform([3, 2])
     ta = tensor_array_ops.TensorArray(
@@ -1145,8 +513,9 @@ class TensorArrayTest(PForTest):
       self.assertAllClose(actual_grad, computed_grad)
 
 
-class StackTest(PForTest):
+class StackTest(PForTestCase):
 
+  @test_util.run_v1_only("b/122612051")
   def test_stack_inside_loop_invariant(self):
 
     def loop_fn(_):
@@ -1162,6 +531,7 @@ class StackTest(PForTest):
 
     self._test_loop_fn(loop_fn, 2, [dtypes.int32] * 2)
 
+  @test_util.run_v1_only("b/122612051")
   def test_stack_inside_push_loop_dependent(self):
 
     def loop_fn(i):
@@ -1177,6 +547,7 @@ class StackTest(PForTest):
 
     self._test_loop_fn(loop_fn, 2, [dtypes.int32] * 2)
 
+  @test_util.run_v1_only("b/122612051")
   def test_stack_outside_pop(self):
     s = data_flow_ops.stack_v2(max_size=4, elem_type=dtypes.int32)
     op = data_flow_ops.stack_push_v2(s, 5)
@@ -1200,6 +571,7 @@ class StackTest(PForTest):
     self.assertAllEqual([6, 6], v2)
     self.assertAllEqual(5, v3)
 
+  @test_util.run_v1_only("b/122612051")
   def test_stack_outside_push(self):
     s = data_flow_ops.stack_v2(max_size=4, elem_type=dtypes.int32)
 
@@ -1212,7 +584,7 @@ class StackTest(PForTest):
 
 # TODO(agarwal): test nested while_loops. This currently requires converting a
 # tf.cond.
-class ControlFlowTest(PForTest):
+class ControlFlowTest(PForTestCase):
 
   def test_while_outside_loop(self):
 
@@ -1223,6 +595,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_invariant_while(self):
 
     def loop_fn(_):
@@ -1230,6 +603,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_invariant_while_with_control_dependency(self):
 
     def loop_fn(i):
@@ -1239,6 +613,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_while_with_stateful_ops(self):
 
     def loop_fn(_):
@@ -1248,6 +623,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_while_unstacked_condition(self):
 
     def loop_fn(i):
@@ -1256,6 +632,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32, dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_while(self):
     x = random_ops.random_uniform([3, 5])
     lengths = constant_op.constant([4, 0, 2])
@@ -1271,6 +648,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_while_jacobian(self):
     x = random_ops.random_uniform([1, 3])
     y = random_ops.random_uniform([3, 3])
@@ -1298,6 +676,7 @@ class ControlFlowTest(PForTest):
       out, expected = sess.run([out, expected_output])
       self.assertAllClose(expected, out)
 
+  @test_util.run_v1_only("b/122612051")
   def test_tensor_array_as_loop_variable(self):
 
     def loop_fn(i):
@@ -1313,6 +692,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_read_tensor_array_partitioned_indices(self):
     # Note that tensor array values are pfor loop dependent, and the while loop
     # termination condition is also dependent on pfor iteration.
@@ -1330,6 +710,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_external_while_loop_grad(self):
     # Here we test that external while_loops that are extended from inside pfor
     # (due to gradient calls) are not actually converted. If the below was
@@ -1355,6 +736,7 @@ class ControlFlowTest(PForTest):
       self.assertAllEqual([1, 1, 1],
                           sess.run(pfor_control_flow_ops.pfor(loop_fn, 3)))
 
+  @test_util.run_v1_only("b/122612051")
   def test_tensor_array_grad(self):
     inp = constant_op.constant(np.random.rand(3, 4, 2), dtype=dtypes.float32)
     ta = tensor_array_ops.TensorArray(dtypes.float32, size=3)
@@ -1452,13 +834,15 @@ def create_dynamic_lstm(cell_fn, batch_size, state_size, max_steps):
   return pfor_output, tf_output
 
 
-class RNNTest(PForTest):
+class RNNTest(PForTestCase):
 
+  @test_util.run_v1_only("b/122612051")
   def test_dynamic_rnn(self):
     pfor_outputs, tf_outputs = create_dynamic_lstm(rnn_cell.BasicRNNCell,
                                                    3, 5, 7)
     self.run_and_assert_equal(pfor_outputs, tf_outputs)
 
+  @test_util.run_v1_only("b/122612051")
   def test_dynamic_lstm(self):
     pfor_outputs, tf_outputs = create_dynamic_lstm(rnn_cell.BasicLSTMCell,
                                                    3, 5, 7)
@@ -1581,8 +965,9 @@ class Benchmarks(test.Benchmark):
       self._run(tf_outputs, 100, name="tf_rnn")
 
 
-class SparseTest(PForTest):
+class SparseTest(PForTestCase):
 
+  @test_util.run_v1_only("b/122612051")
   def test_var_loop_len(self):
     num_iters = array_ops.placeholder(dtypes.int32)
 
@@ -1594,6 +979,7 @@ class SparseTest(PForTest):
     with self.cached_session() as sess:
       sess.run(pfor, feed_dict={num_iters: 3})
 
+  @test_util.run_v1_only("b/122612051")
   def test_sparse_result_none_stacked(self):
     num_iters = 10
 
@@ -1610,6 +996,7 @@ class SparseTest(PForTest):
     manual = sparse_tensor.SparseTensor(indices, values, dense_shapes)
     self.run_and_assert_equal(pfor, manual)
 
+  @test_util.run_v1_only("b/122612051")
   def test_sparse_result_all_stacked(self):
     num_iters = 10
 
@@ -1625,6 +1012,7 @@ class SparseTest(PForTest):
                                         (num_iters, num_iters))
     self.run_and_assert_equal(pfor, manual)
 
+  @test_util.run_v1_only("b/122612051")
   def test_sparse_result_indices_stacked(self):
     num_iters = 10
 
@@ -1639,6 +1027,7 @@ class SparseTest(PForTest):
                                         [1] * num_iters, (num_iters, num_iters))
     self.run_and_assert_equal(pfor, manual)
 
+  @test_util.run_v1_only("b/122612051")
   def test_sparse_result_values_stacked(self):
     num_iters = 10
 
@@ -1653,6 +1042,7 @@ class SparseTest(PForTest):
                                         (num_iters, num_iters))
     self.run_and_assert_equal(pfor, manual)
 
+  @test_util.run_v1_only("b/122612051")
   def test_sparse_result_shapes_stacked(self):
     num_iters = 10
 
@@ -1666,6 +1056,7 @@ class SparseTest(PForTest):
                                         [1] * num_iters, (num_iters, num_iters))
     self.run_and_assert_equal(pfor, manual)
 
+  @test_util.run_v1_only("b/122612051")
   def test_sparse_result_shapes_stacked_2D(self):
     num_iters = 10
 
@@ -1682,7 +1073,7 @@ class SparseTest(PForTest):
     self.run_and_assert_equal(pfor, manual)
 
 
-class ParsingTest(PForTest):
+class ParsingTest(PForTestCase):
 
   def test_decode_csv(self):
     csv_tensor = constant_op.constant([["1:2:3"], ["::"], ["7:8:9"]])
@@ -1694,6 +1085,7 @@ class ParsingTest(PForTest):
 
     self._test_loop_fn(loop_fn, iters=3, loop_fn_dtypes=[dtypes.int32] * 3)
 
+  @test_util.run_v1_only("b/122612051")
   def test_parse_single_example(self):
 
     def _int64_feature(*values):
diff --git a/tensorflow/python/ops/parallel_for/gradients_test.py b/tensorflow/python/ops/parallel_for/gradients_test.py
index 4342833e3eb362e81ff9f60b4649cc5b8de6250f..133e7909922cf6c17a7960f9b993662f222cc446 100644
--- a/tensorflow/python/ops/parallel_for/gradients_test.py
+++ b/tensorflow/python/ops/parallel_for/gradients_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training as keras_training
 from tensorflow.python.layers import layers as tf_layers
 from tensorflow.python.ops import array_ops
@@ -338,6 +339,7 @@ def create_fc_per_eg_jacobians(batch_size, activation_size, num_layers):
   return jacobians, per_eg_jacobians_pfor, per_eg_jacobians_while
 
 
+@test_util.run_v1_only("b/122612051")
 class GradientsTest(test.TestCase):
 
   def run_and_assert_equal(self, targets1, targets2, atol=1e-4, rtol=1e-4):
diff --git a/tensorflow/python/ops/parallel_for/math_test.py b/tensorflow/python/ops/parallel_for/math_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..db88f4fe0332afe8de312da65b9643a24a056bcb
--- /dev/null
+++ b/tensorflow/python/ops/parallel_for/math_test.py
@@ -0,0 +1,405 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for vectorization of math kernels."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
+from tensorflow.python.ops.parallel_for.test_util import PForTestCase
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MathTest(PForTestCase):
+
+  def test_unary_cwise_ops(self):
+    complex_ops = [
+        math_ops.angle,
+        math_ops.imag,
+        math_ops.complex_abs,
+        math_ops.real,
+        math_ops.conj,
+    ]
+    real_ops = [
+        lambda x: math_ops.acosh(1 + math_ops.square(x)),
+        math_ops.abs,
+        math_ops.acos,
+        math_ops.asin,
+        math_ops.asinh,
+        math_ops.atan,
+        math_ops.atanh,
+        math_ops.bessel_i0e,
+        math_ops.bessel_i1e,
+        math_ops.cos,
+        math_ops.cosh,
+        math_ops.digamma,
+        math_ops.erf,
+        math_ops.erfc,
+        math_ops.exp,
+        math_ops.expm1,
+        math_ops.inv,
+        math_ops.is_finite,
+        math_ops.is_inf,
+        math_ops.lgamma,
+        math_ops.log,
+        math_ops.log1p,
+        math_ops.neg,
+        math_ops.negative,
+        math_ops.reciprocal,
+        math_ops.rint,
+        math_ops.round,
+        math_ops.rsqrt,
+        math_ops.sigmoid,
+        math_ops.sign,
+        math_ops.sin,
+        math_ops.sinh,
+        math_ops.sqrt,
+        math_ops.square,
+        math_ops.tan,
+        math_ops.tanh,
+        math_ops.tanh,
+        nn.elu,
+        nn.relu,
+        nn.relu6,
+        nn.selu,
+        nn.softplus,
+        nn.softsign,
+    ]
+    for op in complex_ops + real_ops:
+      with backprop.GradientTape(persistent=True) as g:
+        x = random_ops.random_uniform([3, 5])
+        g.watch(x)
+        if op in complex_ops:
+          y = random_ops.random_uniform([3, 5])
+          g.watch(y)
+          x = math_ops.complex(x, y)
+
+      # pylint: disable=cell-var-from-loop
+      output_dtypes = []
+      def loop_fn(i):
+        with g:
+          x1 = array_ops.gather(x, i)
+          y1 = op(x1)
+          outputs = [op(x), y1]
+          if y1.dtype == dtypes.float32:
+            loss = math_ops.reduce_sum(y1 * y1)
+          else:
+            loss = None
+        if loss is not None:
+          grad = g.gradient(loss, x1)
+          if grad is not None:
+            outputs.append(grad)
+        del output_dtypes[:]
+        output_dtypes.extend([t.dtype for t in outputs])
+        return outputs
+
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=output_dtypes)
+
+  def test_unary_cwise_no_grad(self):
+    for op in [math_ops.ceil,
+               math_ops.floor,
+               math_ops.logical_not]:
+      x = random_ops.random_uniform([3, 5])
+      if op == math_ops.logical_not:
+        x = x > 0
+
+      # pylint: disable=cell-var-from-loop
+      def loop_fn(i):
+        return op(array_ops.gather(x, i))
+
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=x.dtype)
+
+  def test_binary_cwise_ops(self):
+    logical_ops = [
+        math_ops.logical_and,
+        math_ops.logical_or,
+        math_ops.logical_xor
+    ]
+
+    # Wrapper functions restricting the range of inputs of zeta and polygamma.
+    def safe_polygamma(x, y):
+      return math_ops.polygamma(
+          math_ops.round(clip_ops.clip_by_value(y, 1, 10)),
+          x * x + 1)
+
+    def safe_zeta(x, y):
+      return math_ops.zeta(x * x + 1, y * y)
+
+    float_ops = [
+        math_ops.add,
+        math_ops.add_v2,
+        math_ops.atan2,
+        math_ops.complex,
+        math_ops.div,
+        math_ops.divide,
+        math_ops.div_no_nan,
+        math_ops.equal,
+        math_ops.floor_div,
+        math_ops.floor_mod,
+        math_ops.greater,
+        math_ops.greater_equal,
+        math_ops.igamma,
+        math_ops.igammac,
+        math_ops.igamma_grad_a,
+        math_ops.less,
+        math_ops.less_equal,
+        math_ops.maximum,
+        math_ops.minimum,
+        math_ops.mod,
+        math_ops.multiply,
+        math_ops.not_equal,
+        math_ops.pow,
+        math_ops.squared_difference,
+        math_ops.subtract,
+        math_ops.truncate_mod,
+        safe_polygamma,
+        safe_zeta,
+    ]
+    for op in logical_ops + float_ops:
+      x = random_ops.random_uniform([7, 3, 5])
+      y = random_ops.random_uniform([3, 5])
+      if op in logical_ops:
+        x = x > 0
+        y = y > 0
+
+      output_dtypes = []
+      # pylint: disable=cell-var-from-loop
+      def loop_fn(i):
+        x1 = array_ops.gather(x, i)
+        y1 = array_ops.gather(y, i)
+        outputs = [op(x, y), op(x1, y), op(x, y1), op(x1, y1), op(x1, x1)]
+        del output_dtypes[:]
+        output_dtypes.extend([t.dtype for t in outputs])
+        return outputs
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=output_dtypes)
+
+  def test_approximate_equal(self):
+    x = random_ops.random_uniform([3, 5])
+    y = random_ops.random_uniform([3, 5])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      y1 = array_ops.gather(y, i)
+      return math_ops.approximate_equal(x1, y1)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.bool])
+
+  def test_addn(self):
+    x = random_ops.random_uniform([2, 3, 5])
+    y = random_ops.random_uniform([3, 5])
+    z = random_ops.random_uniform([3, 5])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return math_ops.add_n([x1, y, z])
+
+    self._test_loop_fn(loop_fn, 2)
+
+  def test_matmul(self):
+    for tr_a in (True, False):
+      for tr_b in (True, False):
+        for stack_a in (True, False):
+          for stack_b in (True, False):
+            shape_a = (5, 3) if tr_a else (3, 5)
+            if stack_a:
+              shape_a = (2,) + shape_a
+            shape_b = (7, 5) if tr_b else (5, 7)
+            if stack_b:
+              shape_b = (2,) + shape_b
+
+            x = random_ops.random_uniform(shape_a)
+            y = random_ops.random_uniform(shape_b)
+
+            # pylint: disable=cell-var-from-loop
+            def loop_fn(i):
+              a = array_ops.gather(x, i) if stack_a else x
+              b = array_ops.gather(y, i) if stack_b else y
+              return math_ops.matmul(a, b, transpose_a=tr_a, transpose_b=tr_b)
+
+            # pylint: enable=cell-var-from-loop
+
+            self._test_loop_fn(loop_fn, 2)
+
+  def test_batch_matmul(self):
+    for tr_a in (True, False):
+      for tr_b in (True, False):
+        for stack_a in (True, False):
+          for stack_b in (True, False):
+            shape_a = (4, 5, 3) if tr_a else (4, 3, 5)
+            if stack_a:
+              shape_a = (2,) + shape_a
+            shape_b = (4, 7, 5) if tr_b else (4, 5, 7)
+            if stack_b:
+              shape_b = (2,) + shape_b
+
+            x = random_ops.random_uniform(shape_a)
+            y = random_ops.random_uniform(shape_b)
+
+            # pylint: disable=cell-var-from-loop
+            def loop_fn(i):
+              a = array_ops.gather(x, i) if stack_a else x
+              b = array_ops.gather(y, i) if stack_b else y
+              return math_ops.matmul(a, b, transpose_a=tr_a, transpose_b=tr_b)
+
+            # pylint: enable=cell-var-from-loop
+
+            self._test_loop_fn(loop_fn, 2)
+
+  def test_reduction(self):
+    x = random_ops.random_uniform([2, 3, 4, 5])
+    for op in [
+        math_ops.reduce_sum, math_ops.reduce_prod, math_ops.reduce_max,
+        math_ops.reduce_min
+    ]:
+      for axis in ([1], None, [0, 2]):
+        for keepdims in (True, False):
+
+          # pylint: disable=cell-var-from-loop
+          def loop_fn(i):
+            a = array_ops.gather(x, i)
+            return op(a, axis=axis, keepdims=keepdims)
+
+          # pylint: enable=cell-var-from-loop
+
+          self._test_loop_fn(loop_fn, 2)
+
+  def test_cum_sum(self):
+    x = random_ops.random_uniform([2, 3, 4, 5])
+    for axis in (1, -2):
+      for exclusive in (True, False):
+        for reverse in (True, False):
+
+          # pylint: disable=cell-var-from-loop
+          def loop_fn(i):
+            a = array_ops.gather(x, i)
+            return math_ops.cumsum(
+                a, axis=axis, exclusive=exclusive, reverse=reverse)
+
+          # pylint: enable=cell-var-from-loop
+
+          self._test_loop_fn(loop_fn, 2)
+
+  def test_cum_prod(self):
+    x = random_ops.random_uniform([2, 3, 4, 5])
+    for axis in (1, -2):
+      for exclusive in (True, False):
+        for reverse in (True, False):
+
+          # pylint: disable=cell-var-from-loop
+          def loop_fn(i):
+            a = array_ops.gather(x, i)
+            return math_ops.cumprod(
+                a, axis=axis, exclusive=exclusive, reverse=reverse)
+
+          # pylint: enable=cell-var-from-loop
+
+          self._test_loop_fn(loop_fn, 2)
+
+  def test_bias_add(self):
+    x_shape = [2, 3, 4, 5, 6]
+    x = random_ops.random_uniform(x_shape)
+    for data_format in ("NCHW", "NHWC"):
+      with backprop.GradientTape(persistent=True) as g:
+        bias_dim = 2 if data_format == "NCHW" else -1
+        bias_shape = x_shape[bias_dim]
+        bias = random_ops.random_uniform([bias_shape])
+        g.watch(bias)
+
+      # pylint: disable=cell-var-from-loop
+      def loop_fn(i):
+        with g:
+          a = array_ops.gather(x, i)
+          y = nn.bias_add(a, bias, data_format=data_format)
+          loss = math_ops.reduce_sum(y * y)
+        return y, g.gradient(loss, bias)
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(
+          loop_fn, 2, loop_fn_dtypes=[dtypes.float32, dtypes.float32])
+
+  def test_unsorted_segment_sum(self):
+    t = random_ops.random_uniform([3, 3, 2])
+    segment_ids = constant_op.constant([[0, 0, 2], [0, 1, 2], [2, 2, 2]])
+    num_segments = 3
+
+    def loop_fn(i):
+      data = array_ops.gather(t, i)
+      data_0 = array_ops.gather(t, 0)
+      seg_ids = array_ops.gather(segment_ids, i)
+      return (math_ops.unsorted_segment_sum(data, seg_ids, num_segments),
+              math_ops.unsorted_segment_sum(data_0, seg_ids, num_segments))
+
+    self._test_loop_fn(loop_fn, 3, [dtypes.float32] * 2)
+
+  def test_cast(self):
+    x = constant_op.constant([[1], [2]])
+    y = constant_op.constant([[1.0], [2.0]])
+
+    def loop_fn(i):
+      return (math_ops.cast(array_ops.gather(x, i), dtypes.float32),
+              math_ops.cast(array_ops.gather(y, i), dtypes.int32))
+
+    self._test_loop_fn(
+        loop_fn, 2, loop_fn_dtypes=[dtypes.float32, dtypes.int32])
+
+  def test_tanh_axpy(self):
+    a = constant_op.constant(3.)
+    x = random_ops.random_uniform([4, 5])
+    y = random_ops.random_uniform([6, 5])
+    n = x.shape[0]
+
+    def loop_fn(i):
+      return math_ops.tanh(a * array_ops.gather(x, i) + array_ops.gather(y, i))
+
+    self._test_loop_fn(loop_fn, n)
+
+  def test_select(self):
+    cond = constant_op.constant([True, False])
+    a = random_ops.random_uniform([2, 3, 5])
+    b = random_ops.random_uniform([2, 3, 5])
+    for cond_shape in [2], [2, 3], [2, 3, 5]:
+      cond = random_ops.random_uniform(cond_shape) > 0.5
+
+      # pylint: disable=cell-var-from-loop
+      def loop_fn(i):
+        a_i = array_ops.gather(a, i)
+        b_i = array_ops.gather(b, i)
+        cond_i = array_ops.gather(cond, i)
+        return array_ops.where(cond_i, a_i, b_i)
+
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(loop_fn, 2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/ops/parallel_for/test_util.py b/tensorflow/python/ops/parallel_for/test_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b4ef2239e5dc2eb7614d167777821437ae1e812
--- /dev/null
+++ b/tensorflow/python/ops/parallel_for/test_util.py
@@ -0,0 +1,59 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test utility."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.parallel_for import control_flow_ops as pfor_control_flow_ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import nest
+
+
+class PForTestCase(test.TestCase):
+  """Base class for test cases."""
+
+  def _run_targets(self, targets1, targets2=None, run_init=True):
+    targets1 = nest.flatten(targets1)
+    targets2 = ([] if targets2 is None else nest.flatten(targets2))
+    assert len(targets1) == len(targets2) or not targets2
+    if run_init:
+      init = variables.global_variables_initializer()
+      self.evaluate(init)
+    return self.evaluate(targets1 + targets2)
+
+  def run_and_assert_equal(self, targets1, targets2):
+    outputs = self._run_targets(targets1, targets2)
+    outputs = nest.flatten(outputs)  # flatten SparseTensorValues
+    n = len(outputs) // 2
+    for i in range(n):
+      if outputs[i + n].dtype != np.object:
+        self.assertAllClose(outputs[i + n], outputs[i], rtol=1e-4, atol=1e-5)
+      else:
+        self.assertAllEqual(outputs[i + n], outputs[i])
+
+  def _test_loop_fn(self, loop_fn, iters,
+                    loop_fn_dtypes=dtypes.float32,
+                    parallel_iterations=None):
+    t1 = pfor_control_flow_ops.pfor(loop_fn, iters=iters,
+                                    parallel_iterations=parallel_iterations)
+    t2 = pfor_control_flow_ops.for_loop(loop_fn, loop_fn_dtypes, iters=iters,
+                                        parallel_iterations=parallel_iterations)
+    self.run_and_assert_equal(t1, t2)
diff --git a/tensorflow/python/ops/ragged/BUILD b/tensorflow/python/ops/ragged/BUILD
index 89b8c4a2b305e7cd584d8bc215ae30490572f2e4..46f7fa62a380f9b6642f27fec3cf8ec8868ae06f 100644
--- a/tensorflow/python/ops/ragged/BUILD
+++ b/tensorflow/python/ops/ragged/BUILD
@@ -720,6 +720,19 @@ py_test(
     ],
 )
 
+py_test(
+    name = "ragged_rank_op_test",
+    srcs = ["ragged_rank_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        ":ragged_test_util",
+        "//tensorflow/python:framework_test_lib",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "ragged_tile_op_test",
     srcs = ["ragged_tile_op_test.py"],
diff --git a/tensorflow/python/ops/ragged/ragged_array_ops.py b/tensorflow/python/ops/ragged/ragged_array_ops.py
index 3c5e697cdc22dba34729f4d2e6a90f167516fa89..a6b2442f0931764c9c34bd7dfbf90f0f5ca2c7bb 100644
--- a/tensorflow/python/ops/ragged/ragged_array_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_array_ops.py
@@ -1226,3 +1226,33 @@ def _nrows(rt_input, out_type=dtypes.int64, name=None):
   else:
     with ops.name_scope(name, 'RaggedNRows', [rt_input]):
       return array_ops.shape(rt_input, out_type=out_type)[0]
+
+
+#===============================================================================
+# ragged.rank
+#===============================================================================
+def rank(input, name=None):  # pylint: disable=redefined-builtin
+  """Returns the rank of a RaggedTensor.
+
+  Returns a 0-D `int32` `Tensor` representing the rank of `input`.
+
+  For example:
+
+  ```python
+  # shape of tensor 't' is [2, None, None]
+  t = tf.ragged.constant([[[1], [2, 2]], [[3, 3, 3], [4, 4, 4, 4]]])
+  tf.rank(t)  # 3
+  ```
+
+  Args:
+    input: A `RaggedTensor`
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of type `int32`.
+  """
+  with ops.name_scope(name, 'RaggedRank', [input]) as name:
+    if not ragged_tensor.is_ragged(input):
+      return array_ops.rank(input, name)
+
+    return input.ragged_rank + array_ops.rank(input.flat_values)
diff --git a/tensorflow/python/ops/ragged/ragged_dispatch.py b/tensorflow/python/ops/ragged/ragged_dispatch.py
index b024bd9f07e285bbff5f7d9420df1b9582cde524..52c41423467ffaac80a51500d22c261b7b3fd143 100644
--- a/tensorflow/python/ops/ragged/ragged_dispatch.py
+++ b/tensorflow/python/ops/ragged/ragged_dispatch.py
@@ -418,6 +418,7 @@ _RAGGED_DISPATCH_OPS = [
     (array_ops.gather, _ragged_gather_v1, ['params', 'indices']),
     (array_ops.gather_v2, ragged_array_ops.gather, ['params', 'indices']),
     (array_ops.gather_nd, ragged_array_ops.gather_nd, ['params', 'indices']),
+    (array_ops.rank, ragged_array_ops.rank, ['input']),
     (array_ops.stack, ragged_array_ops.stack, ['[values]']),
     (array_ops.tile, ragged_array_ops.tile, ['input']),
     (array_ops.where, ragged_array_ops.where, ['condition', 'x', 'y']),
diff --git a/tensorflow/python/ops/ragged/ragged_dispatch_test.py b/tensorflow/python/ops/ragged/ragged_dispatch_test.py
index 9d70470f05a292e09def389505779b92041f2e99..0c546accc3f9c956a31ef424aa2b74fbcd4a4af0 100644
--- a/tensorflow/python/ops/ragged/ragged_dispatch_test.py
+++ b/tensorflow/python/ops/ragged/ragged_dispatch_test.py
@@ -676,6 +676,10 @@ class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase,
                   1
           },
           expected=[False, True]),
+      dict(
+          op=array_ops.rank,
+          kwargs={'input': ragged_factory_ops.constant_value([[8, 3], [5]])},
+          expected=2),
   ])
   def testRaggedDispatch(self, op, expected, args=(), kwargs=None):
     if kwargs is None: kwargs = {}
diff --git a/tensorflow/python/ops/ragged/ragged_rank_op_test.py b/tensorflow/python/ops/ragged/ragged_rank_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..54eee3bc0425852e82858684509838e5812dffde
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_rank_op_test.py
@@ -0,0 +1,89 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.rank op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from absl.testing import parameterized
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedRankOpTest(ragged_test_util.RaggedTensorTestCase,
+                       parameterized.TestCase):
+
+  @parameterized.parameters([
+      # Rank 0
+      dict(
+          test_input=1,
+          expected_rank=0,
+      ),
+      # Rank 1
+      dict(
+          test_input=[1],
+          expected_rank=1,
+      ),
+      dict(
+          test_input=[1, 2, 3, 4],
+          expected_rank=1,
+      ),
+      # Rank 2
+      dict(
+          test_input=[[1], [2], [3]],
+          expected_rank=2,
+      ),
+      # Rank 3
+      dict(
+          test_input=[[[1], [2, 3]], [[4], [5, 6, 7]]],
+          expected_rank=3,
+      ),
+      # Rank 3, ragged_rank=2
+      dict(
+          test_input=[[[1], [2, 3], [10, 20]],
+                      [[4], [5, 6, 7]]],
+          expected_rank=3,
+          ragged_rank=2,
+      ),
+      # Rank 4, ragged_rank=3 with dimensions: {2, (1, 2), (2), (1, 2)}
+      dict(
+          test_input=[[[[1], [2]]],
+                      [[[3, 4], [5, 6]], [[7, 8], [9, 10]]]],
+          expected_rank=4,
+      ),
+      # Rank 4, ragged_rank=2 with dimensions: {2, (1, 2), (1, 2), 2}
+      dict(
+          test_input=[
+              [[[1, 2]]],
+              [[[5, 6], [7, 8]],
+               [[9, 10], [11, 12]]]],
+          expected_rank=4,
+          ragged_rank=2,
+      ),
+
+  ])
+  def testRaggedRank(self, test_input, expected_rank, ragged_rank=None):
+    test_input = ragged_factory_ops.constant(
+        test_input, ragged_rank=ragged_rank)
+    self.assertAllEqual(ragged_array_ops.rank(
+        test_input), expected_rank)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 3e26463ddd0806414ab5e212ca77309f78c6dfc4..2d46e2107490cbe80dae09060949340120d3f214 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 import contextlib
+import functools
 
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.framework import variable_pb2
@@ -36,6 +37,7 @@ from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import gen_state_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
@@ -160,8 +162,7 @@ def shape_safe_assign_variable_handle(handle, shape, value, name=None):
                                                       name=name)
 
 
-# TODO(apassos) make this be variables.Variable
-class ResourceVariable(variables.RefVariable):
+class ResourceVariable(variables.VariableV1):
   """Variable based on resource handles.
 
   See the [Variables How To](https://tensorflow.org/guide/variables)
@@ -297,6 +298,15 @@ class ResourceVariable(variables.RefVariable):
           dtype=dtype,
           constraint=constraint)
 
+  def __repr__(self):
+    if context.executing_eagerly() and not self._in_graph_mode:
+      return "<tf.Variable '%s' shape=%s dtype=%s, numpy=%s>" % (
+          self.name, self.get_shape(), self.dtype.name,
+          ops.numpy_text(self.read_value(), is_repr=True))
+    else:
+      return "<tf.Variable '%s' shape=%s dtype=%s>" % (
+          self.name, self.get_shape(), self.dtype.name)
+
   # pylint: disable=unused-argument
   def _init_from_args(self,
                       initial_value=None,
@@ -394,10 +404,12 @@ class ResourceVariable(variables.RefVariable):
         handle_name = ops._name_from_scope_name(name)
         if self._in_graph_mode:
           shared_name = handle_name
+          unique_id = shared_name
         else:
           # When in eager mode use a uid for the shared_name, to prevent
           # accidental sharing.
-          shared_name = "%s_%d" % (handle_name, ops.uid())
+          unique_id = "%s_%d" % (handle_name, ops.uid())
+          shared_name = context.shared_name()
         # Use attr_scope and device(None) to simulate the behavior of
         # colocate_with when the variable we want to colocate with doesn't
         # yet exist.
@@ -425,7 +437,7 @@ class ResourceVariable(variables.RefVariable):
               "variable inside a loop or conditional, use a lambda as the "
               "initializer." % name)
         # pylint: enable=protected-access
-        self._unique_id = shared_name
+        self._unique_id = unique_id
         self._initial_value = initial_value if self._in_graph_mode else None
         self._handle_name = handle_name + ":0"
         self._dtype = initial_value.dtype.base_dtype
@@ -437,12 +449,15 @@ class ResourceVariable(variables.RefVariable):
                 gen_resource_variable_ops.var_is_initialized_op(self._handle))
           if initial_value is not None:
             with ops.name_scope("Assign") as n, ops.colocate_with(self._handle):
+              # pylint: disable=protected-access
               self._initializer_op = (
                   gen_resource_variable_ops.assign_variable_op(
                       self._handle,
-                      self._try_guard_against_uninitialized_dependencies(
+                      variables._try_guard_against_uninitialized_dependencies(
+                          name,
                           initial_value),
                       name=n))
+              # pylint: enable=protected-access
           with ops.name_scope("Read"), ops.colocate_with(self._handle):
             # Manually assign reads to the handle's device to avoid log
             # messages.
@@ -490,7 +505,6 @@ class ResourceVariable(variables.RefVariable):
       # all in graph mode.
       self._handle_deleter = EagerResourceDeleter(
           handle=self._handle, handle_device=self._handle.device)
-    self._cached_shape_as_list = None
 
   def _init_from_proto(self, variable_def, import_scope=None):
     """Initializes from `VariableDef` proto."""
@@ -548,7 +562,6 @@ class ResourceVariable(variables.RefVariable):
     self._caching_device = None
     self._dtype = dtypes.as_dtype(self._handle.op.get_attr("dtype"))
     self._constraint = None
-    self._cached_shape_as_list = None
 
   @contextlib.contextmanager
   def _assign_dependencies(self):
@@ -583,7 +596,8 @@ class ResourceVariable(variables.RefVariable):
         trainable=self._trainable,
         constraint=self._constraint,
         dtype=self._dtype,
-        name=self._shared_name + "_copy")
+        name=self._shared_name + "_copy",
+        distribute_strategy=self.distribute_strategy)
     memo[self._unique_id] = copied_variable
     return copied_variable
 
@@ -618,12 +632,9 @@ class ResourceVariable(variables.RefVariable):
     return self._distribute_strategy
 
   def _shape_as_list(self):
-    if self._cached_shape_as_list:
-      return self._cached_shape_as_list
     if self.shape.ndims is None:
       return None
-    self._cached_shape_as_list = [dim.value for dim in self.shape.dims]
-    return self._cached_shape_as_list
+    return [dim.value for dim in self.shape.dims]
 
   def _shape_tuple(self):
     shape = self._shape_as_list()
@@ -683,6 +694,10 @@ class ResourceVariable(variables.RefVariable):
     """The op for this variable."""
     return self._handle.op
 
+  @property
+  def trainable(self):
+    return self._trainable
+
   def eval(self, session=None):
     """Evaluates and returns the value of this variable."""
     if context.executing_eagerly():
@@ -818,10 +833,6 @@ class ResourceVariable(variables.RefVariable):
     return ResourceVariable(
         variable_def=variable_def, import_scope=import_scope)
 
-  def _ref(self):
-    """Unsupported."""
-    raise NotImplementedError("ResourceVariable does not implement _ref()")
-
   def set_shape(self, shape):
     """Unsupported."""
     raise NotImplementedError("ResourceVariable does not implement set_shape()")
@@ -929,7 +940,15 @@ class ResourceVariable(variables.RefVariable):
     return assign_op
 
   def __reduce__(self):
-    return (ResourceVariable, (self.numpy(),))
+    # The implementation mirrors that of __deepcopy__.
+    return functools.partial(
+        ResourceVariable,
+        initial_value=self.numpy(),
+        trainable=self.trainable,
+        name=self._shared_name,
+        dtype=self.dtype,
+        constraint=self.constraint,
+        distribute_strategy=self.distribute_strategy), ()
 
   def scatter_sub(self, sparse_delta, use_locking=False, name=None):
     """Subtracts `IndexedSlices` from this variable.
@@ -994,6 +1013,55 @@ class ResourceVariable(variables.RefVariable):
         self.handle, sparse_delta.indices,
         ops.convert_to_tensor(sparse_delta.values, self.dtype), name=name))
 
+  def batch_scatter_update(self, sparse_delta, use_locking=False, name=None):
+    """Assigns `IndexedSlices` to this variable batch-wise.
+
+    Analogous to `batch_gather`. This assumes that this variable and the
+    sparse_delta IndexedSlices have a series of leading dimensions that are the
+    same for all of them, and the updates are performed on the last dimension of
+    indices. In other words, the dimensions should be the following:
+
+    `num_prefix_dims = sparse_delta.indices.ndims - 1`
+    `batch_dim = num_prefix_dims + 1`
+    `sparse_delta.updates.shape = sparse_delta.indices.shape + var.shape[
+         batch_dim:]`
+
+    where
+
+    `sparse_delta.updates.shape[:num_prefix_dims]`
+    `== sparse_delta.indices.shape[:num_prefix_dims]`
+    `== var.shape[:num_prefix_dims]`
+
+    And the operation performed can be expressed as:
+
+    `var[i_1, ..., i_n,
+         sparse_delta.indices[i_1, ..., i_n, j]] = sparse_delta.updates[
+            i_1, ..., i_n, j]`
+
+    When sparse_delta.indices is a 1D tensor, this operation is equivalent to
+    `scatter_update`.
+
+    To avoid this operation one can looping over the first `ndims` of the
+    variable and using `scatter_update` on the subtensors that result of slicing
+    the first dimension. This is a valid option for `ndims = 1`, but less
+    efficient than this implementation.
+
+    Args:
+      sparse_delta: `IndexedSlices` to be assigned to this variable.
+      use_locking: If `True`, use locking during the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    return self._lazy_read(state_ops.batch_scatter_update(
+        self, sparse_delta.indices, sparse_delta.values,
+        use_locking=use_locking, name=name))
+
   def scatter_nd_sub(self, indices, updates, name=None):
     """Applies sparse subtraction to individual values or slices in a Variable.
 
@@ -1178,8 +1246,10 @@ class ResourceVariable(variables.RefVariable):
 
   def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False):
     del name
-    if dtype is not None and dtype != self.dtype:
-      return NotImplemented
+    if dtype is not None and not dtype.is_compatible_with(self.dtype):
+      raise ValueError(
+          "Incompatible type conversion requested to type {!r} for variable "
+          "of type {!r}".format(dtype.name, self.dtype.name))
     if as_ref:
       return self.read_value().op.inputs[0]
     else:
@@ -1231,6 +1301,12 @@ def _dense_var_to_tensor(var, dtype=None, name=None, as_ref=False):
   return var._dense_var_to_tensor(dtype=dtype, name=name, as_ref=as_ref)  # pylint: disable=protected-access
 
 
+# Register a conversion function which reads the value of the variable,
+# allowing instances of the class to be used as tensors.
+ops.register_tensor_conversion_function(ResourceVariable, _dense_var_to_tensor)
+ops.register_dense_tensor_like_type(ResourceVariable)
+
+
 class _UnreadVariable(ResourceVariable):
   """Represents a future for a read of a variable.
 
@@ -1282,16 +1358,12 @@ class _UnreadVariable(ResourceVariable):
       return gen_resource_variable_ops.read_variable_op(self._handle,
                                                         self._dtype)
 
-  def set_shape(self, shape):
-    self._shape = shape
-    self._cached_shape_as_list = None
-
   @property
   def op(self):
     """The op for this variable."""
     return self._parent_op
 
-ops.register_tensor_conversion_function(_UnreadVariable, _dense_var_to_tensor)
+
 ops.register_dense_tensor_like_type(_UnreadVariable)
 
 
@@ -1374,10 +1446,6 @@ class _MixedPrecisionVariable(ResourceVariable):
       else:
         return res
 
-  def set_shape(self, shape):
-    self._shape = shape
-    self._cached_shape_as_list = None
-
   @property
   def op(self):
     """The op for this variable."""
@@ -1390,29 +1458,15 @@ class _MixedPrecisionVariable(ResourceVariable):
 
   def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False):
     del name
-    dtype = dtype or self.read_dtype
-    if dtype != self.read_dtype or as_ref:
+    if (dtype is not None and
+        not dtype.is_compatible_with(self.read_dtype) or as_ref):
       return NotImplemented
-    else:
-      res = self.value()
-    return res
+    return self.value()
 
   def _should_act_as_resource_variable(self):
     """To pass resource_variable_ops.is_resource_variable check."""
     pass
 
-# Register a conversion function which reads the value of the variable,
-# allowing instances of the class to be used as tensors.
-
-# Note: registering for Variable after ResourceVariable because inheritance will
-# otherwise lead to the wrong behavior.
-ops.register_tensor_conversion_function(ResourceVariable, _dense_var_to_tensor)
-ops.register_tensor_conversion_function(
-    variables.Variable, variables.Variable._TensorConversionFunction)  # pylint: disable=protected-access
-
-# pylint: disable=protected-access
-ops.register_dense_tensor_like_type(ResourceVariable)
-
 
 @ops.RegisterGradient("ReadVariableOp")
 def _ReadGrad(_, grad):
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 097b485a115fb8153f77d0ad24c63b872fb2e8ca..a149d9873016e52164d072ee4cabd98167bfa3dd 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -806,8 +806,8 @@ def sparse_split(keyword_required=KeywordRequired(),
   Graphically the output tensors are:
 
       output_tensor[0] =
-      [    a ]
-      [b c   ]
+      [    a   ]
+      [b c     ]
 
       output_tensor[1] =
       [ d e  ]
@@ -1774,7 +1774,9 @@ def sparse_reset_shape(sp_input, new_shape=None):
     output_shape_tensor = math_ops.cast(output_shape_tensor, dtypes.int64)
     # For cases when shape is known during graph construction, this catches the
     # error before the sparse_tensor.SparseTensor catches it.
-    output_shape_tensor.get_shape()[0].merge_with(in_shape.get_shape()[0])
+    if output_shape_tensor.get_shape().rank is not None:
+      output_shape_tensor.get_shape().dims[0].merge_with(
+          in_shape.get_shape().dims[0])
 
     output_shape_tensor_const = tensor_util.constant_value(output_shape_tensor)
     # For cases where all shapes are known during graph construction
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index 71aaceee272f6e0acd8b8e860fb501eaed4bd61b..be21263f4cbdbdd4a38b0e849e1fec15ba033712 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -432,19 +432,19 @@ def scatter_nd_add(ref, indices, updates, use_locking=False, name=None):
   `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
   ```
-  [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+  [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
   ```
 
   For example, say we want to add 4 scattered elements to a rank-1 tensor to
-  8 elements. In Python, that update would look like this:
+  8 elements. In Python, that addition would look like this:
 
   ```python
-      ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-      indices = tf.constant([[4], [3], [1] ,[7]])
-      updates = tf.constant([9, 10, 11, 12])
-      add = tf.scatter_nd_add(ref, indices, updates)
-      with tf.Session() as sess:
-        print sess.run(add)
+  ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+  indices = tf.constant([[4], [3], [1], [7]])
+  updates = tf.constant([9, 10, 11, 12])
+  add = tf.scatter_nd_add(ref, indices, updates)
+  with tf.Session() as sess:
+    print sess.run(add)
   ```
 
   The resulting update to ref would look like this:
@@ -464,9 +464,8 @@ def scatter_nd_add(ref, indices, updates, use_locking=False, name=None):
     updates: A `Tensor`. Must have the same type as `ref`.
       A tensor of updated values to add to ref.
     use_locking: An optional `bool`. Defaults to `False`.
-      An optional bool. Defaults to True. If True, the assignment will
-      be protected by a lock; otherwise the behavior is undefined,
-      but may exhibit less contention.
+      If True, the assignment will be protected by a lock;
+      otherwise the behavior is undefined, but may exhibit less contention.
     name: A name for the operation (optional).
 
   Returns:
@@ -550,19 +549,19 @@ def scatter_nd_sub(ref, indices, updates, use_locking=False, name=None):
   `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
   ```
-  [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+  [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
   ```
 
   For example, say we want to subtract 4 scattered elements from a rank-1 tensor
-  to 8 elements. In Python, that update would look like this:
+  with 8 elements. In Python, that update would look like this:
 
   ```python
-      ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-      indices = tf.constant([[4], [3], [1] ,[7]])
-      updates = tf.constant([9, 10, 11, 12])
-      op = tf.scatter_nd_sub(ref, indices, updates)
-      with tf.Session() as sess:
-        print sess.run(op)
+  ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+  indices = tf.constant([[4], [3], [1] ,[7]])
+  updates = tf.constant([9, 10, 11, 12])
+  op = tf.scatter_nd_sub(ref, indices, updates)
+  with tf.Session() as sess:
+    print sess.run(op)
   ```
 
   The resulting update to ref would look like this:
diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py
index 7c2d3be338766a4e25a817f824e06c665059bc01..4eaa16a22cb1951437bb60605e040023b4d9429d 100644
--- a/tensorflow/python/ops/template.py
+++ b/tensorflow/python/ops/template.py
@@ -387,8 +387,11 @@ class Template(checkpointable.CheckpointableBase):
     """Returns the variable scope name created by this Template."""
     if self._variable_scope:
       name = self._variable_scope.name
-      # To prevent partial matches on the scope_name, we add '/' at the end.
-      return name if name[-1] == "/" else name + "/"
+      if not name or name[-1] == "/":
+        return name
+      else:
+        # To prevent partial matches on the scope_name, we add '/' at the end.
+        return name + "/"
 
   @property
   def variables(self):
@@ -646,29 +649,6 @@ class EagerTemplate(Template):
         with self._template_store.as_default():
           return self._call_func(args, kwargs)
 
-  @property
-  def name(self):
-    """Returns the name given to this Template."""
-    return self._name
-
-  @property
-  def func(self):
-    """Returns the func given to this Template."""
-    return self._func
-
-  @property
-  def variable_scope(self):
-    """Returns the variable scope object created by this Template."""
-    return self._variable_scope
-
-  @property
-  def variable_scope_name(self):
-    """Returns the variable scope name created by this Template."""
-    if self._variable_scope:
-      name = self._variable_scope.name
-      # To prevent partial matches on the scope_name, we add '/' at the end.
-      return name if name[-1] == "/" else name + "/"
-
   @property
   def variables(self):
     """Returns the list of variables created by the Template."""
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index 37d5e6ae2ae8dd55e1da523331c13e69c21b3288..90a8b0af469b1be36340244c0dfdf43e013c75a2 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -114,6 +114,7 @@ class _GraphTensorArray(object):
 
     if clear_after_read is None:
       clear_after_read = True
+    self._dynamic_size = None
     dynamic_size = dynamic_size or False
 
     self._dtype = dtype
@@ -221,7 +222,9 @@ class _GraphTensorArray(object):
     """See TensorArray."""
     flow = array_ops.identity(self._flow)
     ta = TensorArray(
-        dtype=self._dtype, handle=self._handle, flow=flow,
+        dtype=self._dtype,
+        handle=self._handle,
+        flow=flow,
         infer_shape=self._infer_shape,
         colocate_with_first_write_call=self._colocate_with_first_write_call)
     ta._element_shape = self._element_shape
@@ -278,7 +281,9 @@ class _GraphTensorArray(object):
             flow_in=self._flow,
             name=name)
       ta = TensorArray(
-          dtype=self._dtype, handle=self._handle, flow=flow_out,
+          dtype=self._dtype,
+          handle=self._handle,
+          flow=flow_out,
           colocate_with_first_write_call=self._colocate_with_first_write_call)
       ta._infer_shape = self._infer_shape
       ta._element_shape = self._element_shape
@@ -349,7 +354,9 @@ class _GraphTensorArray(object):
             flow_in=self._flow,
             name=name)
       ta = TensorArray(
-          dtype=self._dtype, handle=self._handle, flow=flow_out,
+          dtype=self._dtype,
+          handle=self._handle,
+          flow=flow_out,
           colocate_with_first_write_call=self._colocate_with_first_write_call)
       ta._infer_shape = self._infer_shape
       ta._element_shape = self._element_shape
@@ -378,7 +385,9 @@ class _GraphTensorArray(object):
             flow_in=self._flow,
             name=name)
       ta = TensorArray(
-          dtype=self._dtype, handle=self._handle, flow=flow_out,
+          dtype=self._dtype,
+          handle=self._handle,
+          flow=flow_out,
           colocate_with_first_write_call=self._colocate_with_first_write_call)
       ta._infer_shape = self._infer_shape
       ta._element_shape = self._element_shape
@@ -448,7 +457,7 @@ class _GraphTensorArrayV2(object):
     del tensor_array_name
     del colocate_with_first_write_call
 
-    del dynamic_size  # TODO(b/117943489): Unused for now.
+    self._dynamic_size = dynamic_size
 
     if (flow is not None and
         (not isinstance(flow, ops.Tensor) or flow.dtype != dtypes.variant)):
@@ -525,10 +534,7 @@ class _GraphTensorArrayV2(object):
   def identity(self):
     """See TensorArray."""
     flow = array_ops.identity(self._flow)
-    ta = TensorArray(
-        dtype=self._dtype, flow=flow, infer_shape=self._infer_shape)
-    ta._element_shape = self._element_shape
-    return ta
+    return build_ta_with_new_flow(self, flow)
 
   def grad(self, source, flow=None, name=None):
     """Not supported."""
@@ -553,11 +559,12 @@ class _GraphTensorArrayV2(object):
       if self._infer_shape:
         self._merge_element_shape(value.shape)
       flow_out = list_ops.tensor_list_set_item(
-          input_handle=self._flow, index=index, item=value, name=name)
-      ta = TensorArray(dtype=self._dtype, handle=None, flow=flow_out)
-      ta._infer_shape = self._infer_shape
-      ta._element_shape = self._element_shape
-      return ta
+          input_handle=self._flow,
+          index=index,
+          item=value,
+          resize_if_index_out_of_bounds=self._dynamic_size,
+          name=name)
+      return build_ta_with_new_flow(self, flow_out)
 
   def stack(self, name=None):
     """See TensorArray."""
@@ -581,10 +588,16 @@ class _GraphTensorArrayV2(object):
 
   def concat(self, name=None):
     """See TensorArray."""
-    value = list_ops.tensor_list_concat(
-        input_handle=self._flow, element_dtype=self._dtype, name=name)
     if self._element_shape and self._element_shape[0].dims is not None:
-      value.set_shape([None] + self._element_shape[0].dims[1:])
+      element_shape = [None] + self._element_shape[0].dims[1:]
+    else:
+      element_shape = None
+
+    value = list_ops.tensor_list_concat(
+        input_handle=self._flow,
+        element_dtype=self._dtype,
+        element_shape=element_shape,
+        name=name)
     return value
 
   @tf_should_use.should_use_result
@@ -596,15 +609,7 @@ class _GraphTensorArrayV2(object):
         self._merge_element_shape(value.shape[1:])
       flow_out = list_ops.tensor_list_from_tensor(
           tensor=value, element_shape=value.shape[1:])
-      ta = TensorArray(
-          dtype=self._dtype,
-          handle=self.handle,
-          flow=flow_out,
-          colocate_with_first_write_call=self._colocate_with_first_write_call)
-      ta._infer_shape = self._infer_shape
-      ta._element_shape = self._element_shape
-      ta._colocate_with = self._colocate_with
-      return ta
+      return build_ta_with_new_flow(self, flow_out)
 
   @tf_should_use.should_use_result
   def scatter(self, indices, value, name=None):
@@ -616,15 +621,7 @@ class _GraphTensorArrayV2(object):
         self._merge_element_shape(value.shape[1:])
       flow_out = list_ops.tensor_list_scatter(
           tensor=value, indices=indices, element_shape=-1)
-      ta = TensorArray(
-          dtype=self._dtype,
-          handle=self.handle,
-          flow=flow_out,
-          colocate_with_first_write_call=self._colocate_with_first_write_call)
-      ta._infer_shape = self._infer_shape
-      ta._element_shape = self._element_shape
-      ta._colocate_with = self._colocate_with
-      return ta
+      return build_ta_with_new_flow(self, flow_out)
 
   @tf_should_use.should_use_result
   def split(self, value, lengths, name=None):
@@ -644,15 +641,7 @@ class _GraphTensorArrayV2(object):
           lengths=lengths_64,
           element_shape=self._element_shape[0] if self._element_shape else None,
           name=name)
-      ta = TensorArray(
-          dtype=self._dtype,
-          handle=self.handle,
-          flow=flow_out,
-          colocate_with_first_write_call=self._colocate_with_first_write_call)
-      ta._infer_shape = self._infer_shape
-      ta._element_shape = self._element_shape
-      ta._colocate_with = self._colocate_with
-      return ta
+      return build_ta_with_new_flow(self, flow_out)
 
   def size(self, name=None):
     """See TensorArray."""
@@ -858,7 +847,8 @@ class _EagerTensorArray(object):
     if self._tensor_array:
       for ix in range(len(self._tensor_array)):
         self._maybe_zero(ix)
-    return array_ops.stack(self._tensor_array, name=name)
+    return ops.convert_to_tensor(
+        self._tensor_array, name=name, dtype=self._dtype)
 
   def gather(self, indices, name=None):
     """See TensorArray."""
@@ -1042,6 +1032,10 @@ class TensorArray(object):
     """The reference to the TensorArray."""
     return self._implementation.handle
 
+  @property
+  def _dynamic_size(self):
+    return self._implementation._dynamic_size
+
   @property
   def _infer_shape(self):
     return self._implementation._infer_shape
@@ -1227,8 +1221,10 @@ class TensorArray(object):
 
 
 def build_ta_with_new_flow(old_ta, flow):
+  """Builds a TensorArray with a new `flow` tensor."""
   ta = TensorArray(
       dtype=old_ta.dtype,
+      dynamic_size=old_ta._dynamic_size,
       handle=old_ta.handle,
       flow=flow,
       infer_shape=old_ta._infer_shape,
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 64cdde3d854dba7d4d2cc2d90f644277d3365d39..35c00778ae5c99cb5688c9ff1fa97b26c72dc855 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -842,8 +842,11 @@ class _VariableStore(object):
         if isinstance(var, resource_variable_ops.ResourceVariable):
           raise ValueError(err_msg)
         tb = var.op.traceback[::-1]
-        # Throw away internal tf entries and only take a few lines.
-        tb = [x for x in tb if "tensorflow/python" not in x[0]][:3]
+        # Throw away internal tf entries and only take a few lines. In some
+        # cases the traceback can be longer (e.g. if someone uses factory
+        # functions to create variables) so we take more than needed in the
+        # default case.
+        tb = [x for x in tb if "tensorflow/python" not in x[0]][:5]
         raise ValueError("%s Originally defined at:\n\n%s" % (err_msg, "".join(
             traceback.format_list(tb))))
       found_var = self._vars[name]
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 657f64deaa54b0cb6b25d3726603c9d4438655a7..f9fc72a6da3f95ed9b695211c869c0a737dcb634 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -59,21 +59,6 @@ def _make_getter(captured_getter, captured_previous):
   return getter
 
 
-def _has_cycle(op, path):
-  """Detect cycles in the dependencies of `initial_value`."""
-  if op.name in path:
-    return True
-  path.add(op.name)
-  for op_input in op.inputs:
-    if _has_cycle(op_input.op, path):
-      return True
-  for op_control_input in op.control_inputs:
-    if _has_cycle(op_control_input, path):
-      return True
-  path.remove(op.name)
-  return False
-
-
 @tf_export("VariableSynchronization")
 class VariableSynchronization(enum.Enum):
   """Indicates when a distributed variable will be synced.
@@ -321,6 +306,7 @@ class Variable(six.with_metaclass(VariableMetaclass,
 
   Here replacing adding `use_resource=True` when constructing the variable will
   fix any nondeterminism issues:
+
   ```
   v = tf.Variable(True, use_resource=True)
   tf.cond(v, lambda: v.assign(False), my_false_fn)
@@ -1041,6 +1027,10 @@ class Variable(six.with_metaclass(VariableMetaclass,
     """Alias of `Variable.shape`."""
     return self.shape
 
+  def _gather_saveables_for_checkpoint(self):
+    """For implementing `Checkpointable`. This object is saveable on its own."""
+    return {checkpointable.VARIABLE_VALUE_KEY: self}
+
   def to_proto(self, export_scope=None):
     """Converts a `Variable` to a `VariableDef` protocol buffer.
 
@@ -1144,6 +1134,9 @@ class Variable(six.with_metaclass(VariableMetaclass,
         return None
 
 
+Variable._OverloadAllOperators()  # pylint: disable=protected-access
+
+
 @tf_export(v1=["Variable"])
 class VariableV1(Variable):
   """See the [Variables Guide](https://tensorflow.org/guide/variables).
@@ -1582,7 +1575,8 @@ class RefVariable(VariableV1):
         # using their initialized_value() method.
         self._initializer_op = state_ops.assign(
             self._variable,
-            self._try_guard_against_uninitialized_dependencies(
+            _try_guard_against_uninitialized_dependencies(
+                name,
                 self._initial_value),
             validate_shape=validate_shape).op
 
@@ -2161,134 +2155,6 @@ class RefVariable(VariableV1):
     else:
       return v.value()
 
-  def _gather_saveables_for_checkpoint(self):
-    """For implementing `Checkpointable`. This object is saveable on its own."""
-    return {checkpointable.VARIABLE_VALUE_KEY: self}
-
-  def _try_guard_against_uninitialized_dependencies(self, initial_value):
-    """Attempt to guard against dependencies on uninitialized variables.
-
-    Replace references to variables in `initial_value` with references to the
-    variable's initialized values. The initialized values are essentially
-    conditional TensorFlow graphs that return a variable's value if it is
-    initialized or its `initial_value` if it hasn't been initialized. This
-    replacement is done on a best effort basis:
-
-    - If the `initial_value` graph contains cycles, we don't do any
-      replacements for that graph.
-    - If the variables that `initial_value` depends on are not present in the
-      `GLOBAL_VARIABLES` or `LOCAL_VARIABLES` we don't replace them.
-
-    In these cases, it is up to the caller to ensure that the `initial_value`
-    graph uses initialized variables or that they guard access to variables
-    using their `initialized_value` method.
-
-    Args:
-      initial_value: `Tensor`. The initial value.
-    Returns:
-      A `Tensor` suitable to initialize a variable.
-    Raises:
-      TypeError: If `initial_value` is not a `Tensor`.
-    """
-    if not isinstance(initial_value, ops.Tensor):
-      raise TypeError("initial_value needs to be a Tensor: %s" % initial_value)
-
-    # Don't modify initial_value if it contains any cyclic dependencies.
-    if _has_cycle(initial_value.op, path=set()):
-      return initial_value
-
-    return self._safe_initial_value_from_tensor(initial_value, op_cache={})
-
-  def _safe_initial_value_from_tensor(self, tensor, op_cache):
-    """Replace dependencies on variables with their initialized values.
-
-    Args:
-      tensor: A `Tensor`. The tensor to replace.
-      op_cache: A dict mapping operation names to `Operation`s. Used to memoize
-        the results so as to avoid creating redundant operations.
-    Returns:
-      A `Tensor` compatible with `tensor`. Any inputs that lead to variable
-      values will be replaced with a corresponding graph that uses the
-      variable's initialized values. This is done on a best-effort basis. If no
-      modifications need to be made then `tensor` will be returned unchanged.
-    """
-    op = tensor.op
-    new_op = op_cache.get(op.name)
-    if new_op is None:
-      new_op = self._safe_initial_value_from_op(op, op_cache)
-      op_cache[op.name] = new_op
-    return new_op.outputs[tensor.value_index]
-
-  def _safe_initial_value_from_op(self, op, op_cache):
-    """Replace dependencies on variables with their initialized values.
-
-    Args:
-      op: An `Operation`. The operation to replace.
-      op_cache: A dict mapping operation names to `Operation`s. Used to memoize
-        the results so as to avoid creating redundant operations.
-    Returns:
-      An `Operation` compatible with `op`. Any inputs that lead to variable
-      values will be replaced with a corresponding graph that uses the
-      variable's initialized values. This is done on a best-effort basis. If no
-      modifications need to be made then `op` will be returned unchanged.
-    """
-    op_type = op.node_def.op
-    if op_type in ("IsVariableInitialized", "VarIsInitializedOp",
-                   "ReadVariableOp"):
-      return op
-
-    # Attempt to find the initialized_value of any variable reference / handles.
-    # TODO(b/70206927): Fix handling of ResourceVariables.
-    if op_type in ("Variable", "VariableV2", "VarHandleOp"):
-      initialized_value = self._find_initialized_value_for_variable(op)
-      return op if initialized_value is None else initialized_value.op
-
-    # Recursively build initializer expressions for inputs.
-    modified = False
-    new_op_inputs = []
-    for op_input in op.inputs:
-      new_op_input = self._safe_initial_value_from_tensor(op_input, op_cache)
-      new_op_inputs.append(new_op_input)
-      modified = modified or (new_op_input != op_input)
-
-    # If at least one input was modified, replace the op.
-    if modified:
-      new_op_type = op_type
-      if new_op_type == "RefSwitch":
-        new_op_type = "Switch"
-      new_op_name = op.node_def.name + "_" + self.name
-      new_op_name = new_op_name.replace(":", "_")
-      return self.graph.create_op(
-          new_op_type, new_op_inputs,
-          op._output_types,  # pylint: disable=protected-access
-          name=new_op_name, attrs=op.node_def.attr)
-
-    return op
-
-  def _find_initialized_value_for_variable(self, variable_op):
-    """Find the initialized value for a variable op.
-
-    To do so, lookup the variable op in the variables collection.
-
-    Args:
-      variable_op: A variable `Operation`.
-    Returns:
-      A `Tensor` representing the initialized value for the variable or `None`
-      if the initialized value could not be found.
-    """
-    try:
-      var_names = [variable_op.node_def.name, variable_op.node_def.name + ":0"]
-      for collection_name in (ops.GraphKeys.GLOBAL_VARIABLES,
-                              ops.GraphKeys.LOCAL_VARIABLES):
-        for var in self.graph.get_collection(collection_name):
-          if var.name in var_names:
-            return var.initialized_value()
-    except AttributeError:
-      # Return None when an incomplete user-defined variable type was put in
-      # the collection.
-      return None
-    return None
-
   # NOTE(mrry): This enables the Variable's overloaded "right" binary
   # operators to run when the left operand is an ndarray, because it
   # accords the Variable class higher priority than an ndarray, or a
@@ -2441,6 +2307,151 @@ class RefVariable(VariableV1):
     return self._save_slice_info
 
 
+def _try_guard_against_uninitialized_dependencies(name, initial_value):
+  """Attempt to guard against dependencies on uninitialized variables.
+
+  Replace references to variables in `initial_value` with references to the
+  variable's initialized values. The initialized values are essentially
+  conditional TensorFlow graphs that return a variable's value if it is
+  initialized or its `initial_value` if it hasn't been initialized. This
+  replacement is done on a best effort basis:
+
+  - If the `initial_value` graph contains cycles, we don't do any
+    replacements for that graph.
+  - If the variables that `initial_value` depends on are not present in the
+    `GLOBAL_VARIABLES` or `LOCAL_VARIABLES` we don't replace them.
+
+  In these cases, it is up to the caller to ensure that the `initial_value`
+  graph uses initialized variables or that they guard access to variables
+  using their `initialized_value` method.
+
+  Args:
+    name: Variable name.
+    initial_value: `Tensor`. The initial value.
+  Returns:
+    A `Tensor` suitable to initialize a variable.
+  Raises:
+    TypeError: If `initial_value` is not a `Tensor`.
+  """
+  if not isinstance(initial_value, ops.Tensor):
+    raise TypeError("initial_value needs to be a Tensor: %s" % initial_value)
+
+  # Don't modify initial_value if it contains any cyclic dependencies.
+  if _has_cycle(initial_value.op, path=set()):
+    return initial_value
+  return _safe_initial_value_from_tensor(name, initial_value, op_cache={})
+
+
+def _has_cycle(op, path):
+  """Detect cycles in the dependencies of `initial_value`."""
+  if op.name in path:
+    return True
+  path.add(op.name)
+  for op_input in op.inputs:
+    if _has_cycle(op_input.op, path):
+      return True
+  for op_control_input in op.control_inputs:
+    if _has_cycle(op_control_input, path):
+      return True
+  path.remove(op.name)
+  return False
+
+
+def _safe_initial_value_from_tensor(name, tensor, op_cache):
+  """Replace dependencies on variables with their initialized values.
+
+  Args:
+    name: Variable name.
+    tensor: A `Tensor`. The tensor to replace.
+    op_cache: A dict mapping operation names to `Operation`s. Used to memoize
+      the results so as to avoid creating redundant operations.
+  Returns:
+    A `Tensor` compatible with `tensor`. Any inputs that lead to variable
+    values will be replaced with a corresponding graph that uses the
+    variable's initialized values. This is done on a best-effort basis. If no
+    modifications need to be made then `tensor` will be returned unchanged.
+  """
+  op = tensor.op
+  new_op = op_cache.get(op.name)
+  if new_op is None:
+    new_op = _safe_initial_value_from_op(name, op, op_cache)
+    op_cache[op.name] = new_op
+  return new_op.outputs[tensor.value_index]
+
+
+def _safe_initial_value_from_op(name, op, op_cache):
+  """Replace dependencies on variables with their initialized values.
+
+  Args:
+    name: Variable name.
+    op: An `Operation`. The operation to replace.
+    op_cache: A dict mapping operation names to `Operation`s. Used to memoize
+      the results so as to avoid creating redundant operations.
+  Returns:
+    An `Operation` compatible with `op`. Any inputs that lead to variable
+    values will be replaced with a corresponding graph that uses the
+    variable's initialized values. This is done on a best-effort basis. If no
+    modifications need to be made then `op` will be returned unchanged.
+  """
+  op_type = op.node_def.op
+  if op_type in ("IsVariableInitialized", "VarIsInitializedOp",
+                 "ReadVariableOp"):
+    return op
+
+  # Attempt to find the initialized_value of any variable reference / handles.
+  # TODO(b/70206927): Fix handling of ResourceVariables.
+  if op_type in ("Variable", "VariableV2", "VarHandleOp"):
+    initialized_value = _find_initialized_value_for_variable(op)
+    return op if initialized_value is None else initialized_value.op
+
+  # Recursively build initializer expressions for inputs.
+  modified = False
+  new_op_inputs = []
+  for op_input in op.inputs:
+    new_op_input = _safe_initial_value_from_tensor(name, op_input, op_cache)
+    new_op_inputs.append(new_op_input)
+    modified = modified or (new_op_input != op_input)
+
+  # If at least one input was modified, replace the op.
+  if modified:
+    new_op_type = op_type
+    if new_op_type == "RefSwitch":
+      new_op_type = "Switch"
+    new_op_name = op.node_def.name + "_" + name
+    new_op_name = new_op_name.replace(":", "_")
+    return op.graph.create_op(
+        new_op_type, new_op_inputs,
+        op._output_types,  # pylint: disable=protected-access
+        name=new_op_name, attrs=op.node_def.attr)
+
+  return op
+
+
+def _find_initialized_value_for_variable(variable_op):
+  """Find the initialized value for a variable op.
+
+  To do so, lookup the variable op in the variables collection.
+
+  Args:
+    variable_op: A variable `Operation`.
+  Returns:
+    A `Tensor` representing the initialized value for the variable or `None`
+    if the initialized value could not be found.
+  """
+  try:
+    var_names = [variable_op.node_def.name, variable_op.node_def.name + ":0"]
+    for collection_name in (ops.GraphKeys.GLOBAL_VARIABLES,
+                            ops.GraphKeys.LOCAL_VARIABLES):
+      for var in variable_op.graph.get_collection(collection_name):
+        if var.name in var_names:
+          return var.initialized_value()
+  except AttributeError:
+    # Return None when an incomplete user-defined variable type was put in
+    # the collection.
+    return None
+  return None
+
+
 class PartitionedVariable(object):
   """A container for partitioned `Variable` objects.
 
@@ -2659,6 +2670,15 @@ class PartitionedVariable(object):
       return assign_list
     return [assign.op for assign in assign_list]
 
+
+# Register a conversion function which reads the value of the variable,
+# allowing instances of the class to be used as tensors.
+ops.register_tensor_conversion_function(
+    RefVariable,
+    RefVariable._TensorConversionFunction)  # pylint: disable=protected-access
+ops.register_dense_tensor_like_type(RefVariable)
+
+
 @tf_export(v1=["global_variables"])
 def global_variables(scope=None):
   """Returns global variables.
@@ -2982,12 +3002,7 @@ def report_uninitialized_variables(var_list=None,
         # uninitialized variables.
         return array_ops.boolean_mask(variable_names_tensor, variables_mask)
 
-# pylint: disable=protected-access
-Variable._OverloadAllOperators()
 
 ops.register_tensor_conversion_function(
-    PartitionedVariable, PartitionedVariable._TensorConversionFunction)
-# pylint: enable=protected-access
-
-
-ops.register_dense_tensor_like_type(Variable)
+    PartitionedVariable,
+    PartitionedVariable._TensorConversionFunction)  # pylint: disable=protected-access
diff --git a/tensorflow/python/ops/while_v2.py b/tensorflow/python/ops/while_v2.py
index 295686f8143c6128b8fb20850178cfe7c2cb8377..f5a51bb1bcb029cfdb729630deb715646783c3d7 100644
--- a/tensorflow/python/ops/while_v2.py
+++ b/tensorflow/python/ops/while_v2.py
@@ -115,12 +115,15 @@ def while_loop(cond,
             loop_counter < maximum_iterations,
             cond(*_pack_sequence_as(orig_loop_vars, args)))
 
+    # NOTE(skyewm): we set collections to the outer graph's collections for
+    # compatibility with TPUEstimator.
     cond_graph = func_graph_module.func_graph_from_py_func(
         cond_name,
         wrapped_cond,
         loop_vars, {},
         signature=_build_signature(loop_vars, shape_invariants),
-        func_graph=util.WhileCondFuncGraph(cond_name),
+        func_graph=util.WhileCondFuncGraph(
+            cond_name, collections=ops.get_default_graph()._collections),  # pylint: disable=protected-access
         add_control_dependencies=add_control_dependencies)
 
     # Add external_captures of cond to the list of loop vars.
@@ -171,7 +174,8 @@ def while_loop(cond,
         wrapped_body,
         loop_vars, {},
         signature=_build_signature(loop_vars, shape_invariants),
-        func_graph=util.WhileBodyFuncGraph(body_name),
+        func_graph=util.WhileBodyFuncGraph(
+            body_name, collections=ops.get_default_graph()._collections),  # pylint: disable=protected-access
         add_control_dependencies=add_control_dependencies)
     # Add external captures of body to the list of loop vars.
     # Note that external tensors will be treated as loop invariants, i.e.,
@@ -254,6 +258,7 @@ def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
   maximum_iterations = op.get_attr(
       "_maximum_iterations") if _is_in_xla_context() else None
   assert not _is_in_xla_context() or maximum_iterations is not None
+  maximum_iterations = _validate_and_convert_to_tensor(maximum_iterations)
 
   # Set the incoming gradient of non-trainable inputs to None. It is possible
   # that we receive non-None gradients for non-trainable types in nested while
@@ -376,28 +381,30 @@ def _validate_and_convert_to_tensor(maximum_iterations):
   Raises:
     ValueError: If `maximum_iterations` is invalid.
   """
-  if _is_in_xla_context():
-    if maximum_iterations is None:
-      raise ValueError("maximum_iterations is None. It is required and must "
-                       "be statically known (e.g. a constant value or known "
-                       "shape dimension) when building while_loop in XLA "
-                       "context.")
-    if isinstance(maximum_iterations, ops.Tensor):
-      # Get the constant value from the `maximum_iterations` tensor to avoid
-      # capturing a Const tensor from outside this graph.
-      maximum_iterations = tensor_util.constant_value(maximum_iterations)
-      if maximum_iterations is None:
-        raise ValueError("maximum_iterations must be statically known (e.g. a "
-                         "constant value or known shape dimension) when "
-                         "building while_loop in XLA context.")
-
-  if maximum_iterations is not None:
-    # EmptyTensorList expects `max_num_elements` to be of type int32.
-    maximum_iterations = ops.convert_to_tensor(
-        maximum_iterations, dtype=dtypes.int32, name="maximum_iterations")
-    if maximum_iterations.shape.ndims != 0:
-      raise ValueError("maximum_iterations must be a scalar, saw shape: %s" %
-                       maximum_iterations.shape)
+  if maximum_iterations is None:
+    return None
+
+  if _is_in_xla_context() and isinstance(maximum_iterations, ops.Tensor):
+    # Get the constant value from the `maximum_iterations` tensor to avoid
+    # capturing a Const tensor from outside this graph.
+    value = tensor_util.constant_value(maximum_iterations)
+    if value is None:
+      # XLA requires maximum_iterations to be statically known (e.g. a
+      # constant value or known shape dimension) when intermediate values
+      # from the forward pass are needed in the gradients pass. However,
+      # maximum_iterations may not be required if the gradient isn't built
+      # or no intermediates are required, thus we return the tensor as is.
+      return maximum_iterations
+
+    maximum_iterations = value
+
+  # EmptyTensorList expects `max_num_elements` to be of type int32.
+  maximum_iterations = ops.convert_to_tensor(
+      maximum_iterations, dtype=dtypes.int32, name="maximum_iterations")
+  if maximum_iterations.shape.ndims != 0:
+    raise ValueError("maximum_iterations must be a scalar, saw shape: %s" %
+                     maximum_iterations.shape)
+
   return maximum_iterations
 
 
@@ -815,7 +822,7 @@ def _copy_handle_data(src_tensors, tgt_tensors):
 
 
 def _maybe_set_maximum_iterations_attr(op, maximum_iterations):
-  if control_flow_util.IsInXLAContext(op):
+  if maximum_iterations is not None and control_flow_util.IsInXLAContext(op):
     # Store the maximum_iterations to use in the gradient pass.
     op._set_attr(  # pylint: disable=protected-access
         "_maximum_iterations",
@@ -846,19 +853,8 @@ def _pack_sequence_as(structure_with_tas, loop_vars):
   """Like `nest.pack_sequence_as` but also replaces flows with TensorArrays."""
 
   def flow_to_tensor_array(flow, ta):  # pylint: disable=missing-docstring
-    if isinstance(ta, tensor_array_ops.TensorArray):
-      # pylint: disable=protected-access
-      new_ta = tensor_array_ops.TensorArray(
-          dtype=ta.dtype,
-          handle=ta.handle,
-          flow=flow,
-          infer_shape=ta._infer_shape,
-          colocate_with_first_write_call=ta._colocate_with_first_write_call)
-      new_ta._colocate_with = ta._colocate_with
-      new_ta._element_shape = ta._element_shape
-      # pylint: enable=protected-access
-      return new_ta
-    return flow
+    return (tensor_array_ops.build_ta_with_new_flow(ta, flow) if isinstance(  # pylint: disable=g-long-ternary
+        ta, tensor_array_ops.TensorArray) else flow)
 
   flattened_loop_vars = [
       flow_to_tensor_array(*z)
diff --git a/tensorflow/python/platform/gfile.py b/tensorflow/python/platform/gfile.py
index d0159e9e9816ba730c843d2b46936b142d47ff79..dd2c615e9e0ca193b68c4242cb64163bc9266762 100644
--- a/tensorflow/python/platform/gfile.py
+++ b/tensorflow/python/platform/gfile.py
@@ -37,7 +37,7 @@ from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export(v1=['gfile.GFile', 'gfile.Open'], v2=['io.gfile.GFile'])
+@tf_export('io.gfile.GFile', v1=['gfile.GFile', 'gfile.Open', 'io.gfile.GFile'])
 class GFile(_FileIO):
   """File I/O wrappers without thread locking.
 
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index 733d471ca29729ba07fca45bb20d5db04ae4cef9..0620e0345cb8ff42d2fa819d004a10fe88e29352 100755
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -32,6 +32,9 @@ limitations under the License.
 %rename("%s") TFE_ContextSetServerDef;
 %rename("%s") TFE_ContextAsyncWait;
 %rename("%s") TFE_ContextAsyncClearError;
+%rename("%s") TFE_NewProfiler;
+%rename("%s") TFE_DeleteProfiler;
+%rename("%s") TFE_ProfilerSerializeToString;
 %rename("%s") TFE_OpNameGetAttrType;
 %rename("%s") TFE_Py_InitEagerTensor;
 %rename("%s") TFE_Py_SetEagerTensorProfiler;
diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD
index 40d7e2f25ee4bd3b28301bf164255c67911d62d5..8da6ff514295be0a68a91d6ed727c15d19f7a3d6 100644
--- a/tensorflow/python/saved_model/BUILD
+++ b/tensorflow/python/saved_model/BUILD
@@ -11,7 +11,7 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
 
@@ -98,17 +98,16 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "loader_test",
     size = "small",
     srcs = ["loader_test.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:private"],
-    deps = [
+    additional_deps = [
         ":builder",
         ":loader",
         ":signature_def_utils",
         ":utils",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
@@ -118,7 +117,6 @@ py_test(
         "//tensorflow/python:state_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
-        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -155,15 +153,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "saved_model_test",
     size = "small",
     srcs = ["saved_model_test.py"],
-    data = ["//tensorflow/cc/saved_model:saved_model_half_plus_two"],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    visibility = ["//visibility:private"],
-    deps = [
+    additional_deps = [
         ":builder",
         ":constants",
         ":loader",
@@ -186,6 +180,8 @@ py_test(
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
     ],
+    data = ["//tensorflow/cc/saved_model:saved_model_half_plus_two"],
+    tags = ["no_windows"],
 )
 
 py_library(
@@ -205,13 +201,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "utils_test",
     size = "small",
     srcs = ["utils_test.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:private"],
-    deps = [
+    additional_deps = [
         ":utils",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -237,13 +231,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "signature_def_utils_test",
     size = "small",
     srcs = ["signature_def_utils_test.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:private"],
-    deps = [
+    additional_deps = [
         ":signature_constants",
         ":signature_def_utils",
         ":utils",
@@ -254,12 +246,11 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "simple_save_test",
     size = "small",
     srcs = ["simple_save_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":loader",
         ":signature_constants",
         ":simple_save",
@@ -305,18 +296,17 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "save_test",
     srcs = ["save_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":loader",
         ":save",
         ":signature_constants",
         ":tag_constants",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/eager:test",
-        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -339,11 +329,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "load_test",
     srcs = ["load_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":load",
         ":save",
         "//tensorflow/python:constant_op",
@@ -408,10 +397,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "nested_structure_coder_test",
     srcs = ["nested_structure_coder_test.py"],
-    deps = [
+    additional_deps = [
         ":nested_structure_coder",
         ":struct_py",
         "//tensorflow/python:framework",
diff --git a/tensorflow/python/saved_model/load.py b/tensorflow/python/saved_model/load.py
index 3e85fade50f4c7fd1bacb0a7d0c711ac01c88402..bbb148561277cd6b32ec050ac72809216b53b6e0 100644
--- a/tensorflow/python/saved_model/load.py
+++ b/tensorflow/python/saved_model/load.py
@@ -58,6 +58,11 @@ class _Loader(object):
           bound_inputs = [
               self._get_tensor_from_node(node_id)
               for node_id in monomorphic_function.bound_inputs]
+          bound_variables = [
+              self._nodes[node_id]
+              for node_id in monomorphic_function.bound_inputs
+              if self._proto.nodes[node_id].WhichOneof("kind") == "variable"
+          ]
           if name in seen_functions:
             if self._functions[name]._captured_inputs != bound_inputs:  # pylint: disable=protected-access
               raise NotImplementedError(
@@ -69,6 +74,7 @@ class _Loader(object):
             # concrete function, note that we did not modify the FuncGraph
             # itself.
             self._functions[name]._captured_inputs = bound_inputs  # pylint: disable=protected-access
+            self._functions[name]._func_graph.variables = bound_variables  # pylint: disable=protected-access
 
   def _get_tensor_from_node(self, node_id):
     obj = self._nodes[node_id]
@@ -79,11 +85,17 @@ class _Loader(object):
     raise ValueError("Can't convert node %s to tensor" % (type(obj)))
 
   def _load_all(self):
+    """Load all saved objects and wire their properties."""
     self._nodes = [self._recreate(proto) for proto in self._proto.nodes]
     # After creating the objects, construct the edges between the objects.
     for obj, object_proto in zip(self._nodes, self._proto.nodes):
       for reference in object_proto.children:
         setattr(obj, reference.local_name, self._nodes[reference.node_id])
+        # Note: if an object has an attribute `__call__` add a class method
+        # that allows `obj()` syntax to work. This is done per-instance to
+        # allow `callable` to be used to find out if an object is callable.
+        if reference.local_name == "__call__":
+          setattr(type(obj), "__call__", _call_attribute)
 
   def _restore_checkpoint(self):
     variables_path = saved_model_utils.get_variables_path(self._export_dir)
@@ -107,8 +119,16 @@ class _Loader(object):
     return factory[kind]()
 
   def _recreate_user_object(self, proto):
+    """Instantiates a SavedUserObject."""
     del proto
-    return tracking.Checkpointable()
+
+    # Note: each user object has its own class. This allows to make each one
+    # individually callable by adding a `__call__` method to the classes of
+    # the objects instances that have a `__call__` property.
+    class _UserObject(tracking.Checkpointable):
+      pass
+
+    return _UserObject()
 
   def _recreate_asset(self, proto):
     filename = os.path.join(
@@ -123,7 +143,11 @@ class _Loader(object):
   def _recreate_variable(self, proto):
     # TODO(andresp): Can we use the checkpointed value as initializer?
     dummy_value = init_ops.Zeros(dtype=proto.dtype)(shape=proto.shape)
-    return variables.Variable(dummy_value)
+    return variables.Variable(dummy_value, trainable=proto.trainable)
+
+
+def _call_attribute(instance, *args, **kwargs):
+  return instance.__call__(*args, **kwargs)
 
 
 def _load_saved_object_graph_proto(filename):
diff --git a/tensorflow/python/saved_model/load_test.py b/tensorflow/python/saved_model/load_test.py
index 9eac3e655503fe87a8d4c0afea002890ce43be56..8b34414d253cd2020e462e0876663fb166e38bd8 100644
--- a/tensorflow/python/saved_model/load_test.py
+++ b/tensorflow/python/saved_model/load_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import os
 import tempfile
 
+from tensorflow.python.eager import backprop
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
@@ -28,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import variables
+from tensorflow.python.saved_model import function_serialization
 from tensorflow.python.saved_model import load
 from tensorflow.python.saved_model import save
 from tensorflow.python.training.checkpointable import tracking
@@ -42,9 +44,6 @@ class LoadTest(test.TestCase):
 
   def test_structure_import(self):
     root = tracking.Checkpointable()
-    root.f = def_function.function(
-        lambda x: 2. * x,
-        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
     root.dep_one = tracking.Checkpointable()
     root.dep_two = tracking.Checkpointable()
     root.dep_two.dep = tracking.Checkpointable()
@@ -52,19 +51,27 @@ class LoadTest(test.TestCase):
     imported = self.cycle(root)
     self.assertIs(imported.dep_three, imported.dep_two.dep)
     self.assertIsNot(imported.dep_one, imported.dep_two)
-    self.assertEqual(4., imported.f(constant_op.constant(2.)).numpy())
 
   def test_variables(self):
     root = tracking.Checkpointable()
-    root.v1 = variables.Variable(1.)
-    root.v2 = variables.Variable(2.)
-    root.f = def_function.function(
-        lambda x: root.v2 * x,
-        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    root.v1 = variables.Variable(1., trainable=True)
+    root.v2 = variables.Variable(2., trainable=False)
     imported = self.cycle(root)
     self.assertEquals(imported.v1.numpy(), 1.0)
+    self.assertTrue(imported.v1.trainable)
     self.assertEquals(imported.v2.numpy(), 2.0)
+    self.assertFalse(imported.v2.trainable)
+
+  def test_capture_variables(self):
+    root = tracking.Checkpointable()
+    root.weights = variables.Variable(2.)
+    root.f = def_function.function(
+        lambda x: root.weights * x,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    imported = self.cycle(root)
     self.assertEqual(4., imported.f(constant_op.constant(2.)).numpy())
+    imported.weights.assign(4.0)
+    self.assertEqual(8., imported.f(constant_op.constant(2.)).numpy())
 
   def _make_asset(self, contents):
     filename = tempfile.mktemp(prefix=self.get_temp_dir())
@@ -72,19 +79,16 @@ class LoadTest(test.TestCase):
       f.write(contents)
     return filename
 
-  def test_assets_import(self):
+  def test_assets(self):
     file1 = self._make_asset("contents 1")
     file2 = self._make_asset("contents 2")
 
     root = tracking.Checkpointable()
-    root.f = def_function.function(
-        lambda x: 2. * x,
-        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
     root.asset1 = tracking.TrackableAsset(file1)
     root.asset2 = tracking.TrackableAsset(file2)
 
     save_dir = os.path.join(self.get_temp_dir(), "save_dir")
-    save.save(root, save_dir)
+    save.save(root, save_dir, signatures={})
 
     file_io.delete_file(file1)
     file_io.delete_file(file2)
@@ -110,18 +114,12 @@ class LoadTest(test.TestCase):
     with open(imported_output, "r") as f:
       self.assertEquals("contents", f.read())
 
-  def test_assets_dedup(self):
+  def test_dedup_assets(self):
     vocab = self._make_asset("contents")
     root = tracking.Checkpointable()
-    root.f = def_function.function(
-        lambda x: 2. * x,
-        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
-
     root.asset1 = tracking.TrackableAsset(vocab)
     root.asset2 = tracking.TrackableAsset(vocab)
-
     imported = self.cycle(root)
-
     self.assertEqual(imported.asset1.asset_path.numpy(),
                      imported.asset2.asset_path.numpy())
 
@@ -154,7 +152,7 @@ class LoadTest(test.TestCase):
     imported = self.cycle(root)
     self.assertEqual(4., imported.f(constant_op.constant(2.0)).numpy())
 
-  def test_nested_func(self):
+  def test_nested_functions(self):
     f = def_function.function(
         lambda x: x*2.0,
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
@@ -254,6 +252,97 @@ class LoadTest(test.TestCase):
     self.cycle(m)
     self.assertEquals(4.0, m.f(constant_op.constant(2.0)).numpy())
 
+  def test_basic_backprop(self):
+    weight = variables.Variable(1., trainable=True)
+    bias = variables.Variable(0., trainable=True)
+    g = def_function.function(
+        lambda x: x*weight + bias,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+
+    root = tracking.Checkpointable()
+    root.weight = weight
+    root.bias = bias
+    root.g = g
+    imported = self.cycle(root)
+    with backprop.GradientTape(watch_accessed_variables=True) as t:
+      x = constant_op.constant([3.5])
+      loss = imported.g(x)
+      grad = t.gradient(loss, [imported.weight, imported.bias])
+      self.assertAllClose(grad, [3.5, 1.0])
+
+  def test_callable(self):
+    class M1(tracking.Checkpointable):
+
+      @def_function.function(
+          input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+      def __call__(self, x):
+        return x
+
+    root = tracking.Checkpointable()
+    root.m1 = M1()
+    root.m2 = tracking.Checkpointable()
+    root.m2.__call__ = def_function.function(
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])(
+            lambda x: x*3.0)
+    imported = self.cycle(root)
+    x = constant_op.constant(1.0)
+
+    self.assertTrue(callable(imported.m1))
+    self.assertAllEqual(root.m1(x), imported.m1(x))
+
+    # Note: `root.m2` was not callable since `__call__` attribute was set
+    # into the instance and not on the class. But after a serialization cycle
+    # that starts to work.
+    self.assertTrue(callable(imported.m2))
+    self.assertAllEqual(root.m2.__call__(x), imported.m2(x))
+
+    # Verify that user objects without `__call__` attribute are not callable.
+    self.assertFalse(callable(imported))
+
+  def test_chain_callable(self):
+    func = def_function.function(
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])(
+            lambda x: x*3.0)
+    root = tracking.Checkpointable()
+    root.__call__ = tracking.Checkpointable()
+    root.__call__.__call__ = tracking.Checkpointable()
+    root.__call__.__call__.__call__ = func
+
+    imported = self.cycle(root)
+    self.assertTrue(callable(imported))
+    x = constant_op.constant(1.0)
+    self.assertAllEqual(imported(x).numpy(), 3.0)
+
+  def test_soft_matching(self):
+
+    @def_function.function(
+        input_signature=[tensor_spec.TensorSpec([None], dtypes.int32)])
+    def func(x):
+      return 2 * x
+
+    root = tracking.Checkpointable()
+    root.f = func
+
+    self.assertAllEqual([2], root.f(constant_op.constant([1])).numpy())
+    self.assertAllEqual([2, 4], root.f(constant_op.constant([1, 2])).numpy())
+
+    self.assertEqual(
+        1, len(function_serialization.list_all_concrete_functions(root.f)))
+
+    imported = self.cycle(root)
+
+    with self.assertRaises(AssertionError):
+      # We cannot call the function with a constant of shape ().
+      self.assertEqual(7, imported.f(constant_op.constant(2)).numpy())
+
+    # TODO(vbardiovsky): When classes are revived with input_signatures, we
+    # should also check that the calls below are not generating any more
+    # concrete functions.
+    self.assertAllEqual([2, 4, 6, 8],
+                        imported.f(constant_op.constant([1, 2, 3, 4])).numpy())
+    self.assertAllEqual([2, 4, 6],
+                        imported.f(constant_op.constant([1, 2, 3])).numpy())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/saved_model/model_utils/export_test.py b/tensorflow/python/saved_model/model_utils/export_test.py
index 776bfff886aeba5d6fc08e14329be39ade8d6061..ef512150a259514fcc4c801eaa06a99441f1f7a2 100644
--- a/tensorflow/python/saved_model/model_utils/export_test.py
+++ b/tensorflow/python/saved_model/model_utils/export_test.py
@@ -22,7 +22,6 @@ import os
 import tempfile
 import time
 
-from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -52,110 +51,110 @@ ops.register_tensor_conversion_function(LabeledTensorMock,
 
 class ExportTest(test_util.TensorFlowTestCase):
 
+  @test_util.deprecated_graph_mode_only
   def test_build_all_signature_defs_without_receiver_alternatives(self):
-    with context.graph_mode():
-      receiver_tensor = array_ops.placeholder(dtypes.string)
-      output_1 = constant_op.constant([1.])
-      output_2 = constant_op.constant(["2"])
-      output_3 = constant_op.constant(["3"])
-      export_outputs = {
-          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-              export_output.RegressionOutput(value=output_1),
-          "head-2": export_output.ClassificationOutput(classes=output_2),
-          "head-3": export_output.PredictOutput(outputs={
-              "some_output_3": output_3
-          }),
-      }
-
-      signature_defs = export_utils.build_all_signature_defs(
-          receiver_tensor, export_outputs)
-
-      expected_signature_defs = {
-          "serving_default":
-              signature_def_utils.regression_signature_def(receiver_tensor,
-                                                           output_1),
-          "head-2":
-              signature_def_utils.classification_signature_def(receiver_tensor,
-                                                               output_2, None),
-          "head-3":
-              signature_def_utils.predict_signature_def({
-                  "input": receiver_tensor
-              }, {"some_output_3": output_3})
-      }
-
-      self.assertDictEqual(expected_signature_defs, signature_defs)
+    receiver_tensor = array_ops.placeholder(dtypes.string)
+    output_1 = constant_op.constant([1.])
+    output_2 = constant_op.constant(["2"])
+    output_3 = constant_op.constant(["3"])
+    export_outputs = {
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+            export_output.RegressionOutput(value=output_1),
+        "head-2": export_output.ClassificationOutput(classes=output_2),
+        "head-3": export_output.PredictOutput(outputs={
+            "some_output_3": output_3
+        }),
+    }
+
+    signature_defs = export_utils.build_all_signature_defs(
+        receiver_tensor, export_outputs)
 
+    expected_signature_defs = {
+        "serving_default":
+            signature_def_utils.regression_signature_def(receiver_tensor,
+                                                         output_1),
+        "head-2":
+            signature_def_utils.classification_signature_def(receiver_tensor,
+                                                             output_2, None),
+        "head-3":
+            signature_def_utils.predict_signature_def({
+                "input": receiver_tensor
+            }, {"some_output_3": output_3})
+    }
+
+    self.assertDictEqual(expected_signature_defs, signature_defs)
+
+  @test_util.deprecated_graph_mode_only
   def test_build_all_signature_defs_with_dict_alternatives(self):
-    with context.graph_mode():
-      receiver_tensor = array_ops.placeholder(dtypes.string)
-      receiver_tensors_alternative_1 = {
-          "foo": array_ops.placeholder(dtypes.int64),
-          "bar": array_ops.sparse_placeholder(dtypes.float32)}
-      receiver_tensors_alternatives = {"other": receiver_tensors_alternative_1}
-      output_1 = constant_op.constant([1.])
-      output_2 = constant_op.constant(["2"])
-      output_3 = constant_op.constant(["3"])
-      export_outputs = {
-          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-              export_output.RegressionOutput(value=output_1),
-          "head-2": export_output.ClassificationOutput(classes=output_2),
-          "head-3": export_output.PredictOutput(outputs={
-              "some_output_3": output_3
-          }),
-      }
-
-      signature_defs = export_utils.build_all_signature_defs(
-          receiver_tensor, export_outputs, receiver_tensors_alternatives)
-
-      expected_signature_defs = {
-          "serving_default":
-              signature_def_utils.regression_signature_def(
-                  receiver_tensor,
-                  output_1),
-          "head-2":
-              signature_def_utils.classification_signature_def(
-                  receiver_tensor,
-                  output_2, None),
-          "head-3":
-              signature_def_utils.predict_signature_def(
-                  {"input": receiver_tensor},
-                  {"some_output_3": output_3}),
-          "other:head-3":
-              signature_def_utils.predict_signature_def(
-                  receiver_tensors_alternative_1,
-                  {"some_output_3": output_3})
-
-          # Note that the alternatives 'other:serving_default' and
-          # 'other:head-2' are invalid, because regession and classification
-          # signatures must take a single string input.  Here we verify that
-          # these invalid signatures are not included in the export_utils.
-      }
-
-      self.assertDictEqual(expected_signature_defs, signature_defs)
+    receiver_tensor = array_ops.placeholder(dtypes.string)
+    receiver_tensors_alternative_1 = {
+        "foo": array_ops.placeholder(dtypes.int64),
+        "bar": array_ops.sparse_placeholder(dtypes.float32)}
+    receiver_tensors_alternatives = {"other": receiver_tensors_alternative_1}
+    output_1 = constant_op.constant([1.])
+    output_2 = constant_op.constant(["2"])
+    output_3 = constant_op.constant(["3"])
+    export_outputs = {
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+            export_output.RegressionOutput(value=output_1),
+        "head-2": export_output.ClassificationOutput(classes=output_2),
+        "head-3": export_output.PredictOutput(outputs={
+            "some_output_3": output_3
+        }),
+    }
 
+    signature_defs = export_utils.build_all_signature_defs(
+        receiver_tensor, export_outputs, receiver_tensors_alternatives)
+
+    expected_signature_defs = {
+        "serving_default":
+            signature_def_utils.regression_signature_def(
+                receiver_tensor,
+                output_1),
+        "head-2":
+            signature_def_utils.classification_signature_def(
+                receiver_tensor,
+                output_2, None),
+        "head-3":
+            signature_def_utils.predict_signature_def(
+                {"input": receiver_tensor},
+                {"some_output_3": output_3}),
+        "other:head-3":
+            signature_def_utils.predict_signature_def(
+                receiver_tensors_alternative_1,
+                {"some_output_3": output_3})
+
+        # Note that the alternatives 'other:serving_default' and
+        # 'other:head-2' are invalid, because regession and classification
+        # signatures must take a single string input.  Here we verify that
+        # these invalid signatures are not included in the export_utils.
+    }
+
+    self.assertDictEqual(expected_signature_defs, signature_defs)
+
+  @test_util.deprecated_graph_mode_only
   def test_build_all_signature_defs_with_single_alternatives(self):
-    with context.graph_mode():
-      receiver_tensor = array_ops.placeholder(dtypes.string)
-      receiver_tensors_alternative_1 = array_ops.placeholder(dtypes.int64)
-      receiver_tensors_alternative_2 = array_ops.sparse_placeholder(
-          dtypes.float32)
-      # Note we are passing single Tensors as values of
-      # receiver_tensors_alternatives, where normally that is a dict.
-      # In this case a dict will be created using the default receiver tensor
-      # name "input".
-      receiver_tensors_alternatives = {"other1": receiver_tensors_alternative_1,
-                                       "other2": receiver_tensors_alternative_2}
-      output_1 = constant_op.constant([1.])
-      output_2 = constant_op.constant(["2"])
-      output_3 = constant_op.constant(["3"])
-      export_outputs = {
-          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-              export_output.RegressionOutput(value=output_1),
-          "head-2": export_output.ClassificationOutput(classes=output_2),
-          "head-3": export_output.PredictOutput(outputs={
-              "some_output_3": output_3
-          }),
-      }
+    receiver_tensor = array_ops.placeholder(dtypes.string)
+    receiver_tensors_alternative_1 = array_ops.placeholder(dtypes.int64)
+    receiver_tensors_alternative_2 = array_ops.sparse_placeholder(
+        dtypes.float32)
+    # Note we are passing single Tensors as values of
+    # receiver_tensors_alternatives, where normally that is a dict.
+    # In this case a dict will be created using the default receiver tensor
+    # name "input".
+    receiver_tensors_alternatives = {"other1": receiver_tensors_alternative_1,
+                                     "other2": receiver_tensors_alternative_2}
+    output_1 = constant_op.constant([1.])
+    output_2 = constant_op.constant(["2"])
+    output_3 = constant_op.constant(["3"])
+    export_outputs = {
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+            export_output.RegressionOutput(value=output_1),
+        "head-2": export_output.ClassificationOutput(classes=output_2),
+        "head-3": export_output.PredictOutput(outputs={
+            "some_output_3": output_3
+        }),
+    }
 
     signature_defs = export_utils.build_all_signature_defs(
         receiver_tensor, export_outputs, receiver_tensors_alternatives)
@@ -222,35 +221,35 @@ class ExportTest(test_util.TensorFlowTestCase):
     self.assertTrue(int(time_1) < int(time_2))
     self.assertTrue(int(time_2) < int(time_3))
 
+  @test_util.deprecated_graph_mode_only
   def test_build_all_signature_defs_serving_only(self):
-    with context.graph_mode():
-      receiver_tensor = {"input": array_ops.placeholder(dtypes.string)}
-      output_1 = constant_op.constant([1.])
-      export_outputs = {
-          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-              export_output.PredictOutput(outputs=output_1),
-          "train": export_output.TrainOutput(loss=output_1),
-      }
-
-      signature_defs = export_utils.build_all_signature_defs(
-          receiver_tensor, export_outputs)
-
-      expected_signature_defs = {
-          "serving_default": signature_def_utils.predict_signature_def(
-              receiver_tensor, {"output": output_1})
-      }
-
-      self.assertDictEqual(expected_signature_defs, signature_defs)
-
-      signature_defs = export_utils.build_all_signature_defs(
-          receiver_tensor, export_outputs, serving_only=False)
-
-      expected_signature_defs.update({
-          "train": signature_def_utils.supervised_train_signature_def(
-              receiver_tensor, loss={"loss": output_1})
-      })
-
-      self.assertDictEqual(expected_signature_defs, signature_defs)
+    receiver_tensor = {"input": array_ops.placeholder(dtypes.string)}
+    output_1 = constant_op.constant([1.])
+    export_outputs = {
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+            export_output.PredictOutput(outputs=output_1),
+        "train": export_output.TrainOutput(loss=output_1),
+    }
+
+    signature_defs = export_utils.build_all_signature_defs(
+        receiver_tensor, export_outputs)
+
+    expected_signature_defs = {
+        "serving_default": signature_def_utils.predict_signature_def(
+            receiver_tensor, {"output": output_1})
+    }
+
+    self.assertDictEqual(expected_signature_defs, signature_defs)
+
+    signature_defs = export_utils.build_all_signature_defs(
+        receiver_tensor, export_outputs, serving_only=False)
+
+    expected_signature_defs.update({
+        "train": signature_def_utils.supervised_train_signature_def(
+            receiver_tensor, loss={"loss": output_1})
+    })
+
+    self.assertDictEqual(expected_signature_defs, signature_defs)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/saved_model/save.py b/tensorflow/python/saved_model/save.py
index a76b370565162218a39e5aaa6d92a586fc9cf747..9db6d03ed098d15282c3466c0fed83d0025c8f35 100644
--- a/tensorflow/python/saved_model/save.py
+++ b/tensorflow/python/saved_model/save.py
@@ -222,7 +222,7 @@ def _normalize_outputs(outputs, function_name, signature_key):
 
 
 def _tensor_dict_to_tensorinfo(tensor_dict):
-  return {key: utils_impl.build_tensor_info(value)
+  return {key: utils_impl.build_tensor_info_internal(value)
           for key, value in tensor_dict.items()}
 
 
@@ -610,6 +610,7 @@ def _write_object_proto(obj, proto, asset_file_def_index, node_ids):
     proto.asset.asset_file_def_index = asset_file_def_index[obj]
   elif resource_variable_ops.is_resource_variable(obj):
     proto.variable.SetInParent()
+    proto.variable.trainable = obj.trainable
     proto.variable.dtype = obj.dtype.as_datatype_enum
     proto.variable.shape.CopyFrom(obj.shape.as_proto())
   elif isinstance(obj, def_function.PolymorphicFunction):
diff --git a/tensorflow/python/saved_model/save_test.py b/tensorflow/python/saved_model/save_test.py
index 457349599840d830278459fda909e0b80ffaab56..533b954190acfcf6151242d97ce4f77b639c8c9d 100644
--- a/tensorflow/python/saved_model/save_test.py
+++ b/tensorflow/python/saved_model/save_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import lookup_ops
@@ -40,7 +41,7 @@ from tensorflow.python.saved_model import loader
 from tensorflow.python.saved_model import save
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import tag_constants
-from tensorflow.python.training import adam
+from tensorflow.python.keras.optimizer_v2 import adam
 from tensorflow.python.training.checkpointable import tracking
 from tensorflow.python.training.checkpointable import util
 
@@ -49,7 +50,7 @@ class _ModelWithOptimizer(util.Checkpoint):
 
   def __init__(self):
     self.dense = core.Dense(1)
-    self.optimizer = adam.AdamOptimizer(0.01)
+    self.optimizer = adam.Adam(0.01)
 
   @def_function.function(
       input_signature=(tensor_spec.TensorSpec([None, 2], dtypes.float32),
@@ -306,6 +307,30 @@ class SaveTest(test.TestCase):
       self.assertNotIn("T", complex_node.attr)
       self.assertNotIn("Tout", complex_node.attr)
 
+  def test_subclassed_no_signature(self):
+
+    class Subclassed(training.Model):
+
+      def call(self, inputs):
+        return inputs * 2.
+
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    model = Subclassed()
+    with self.assertRaisesRegexp(
+        ValueError, "no @tf.function-decorated methods"):
+      save.save(model, save_dir)
+
+    traced_call = def_function.function(
+        model.call,
+        input_signature=(tensor_spec.TensorSpec(
+            (None, None),
+            dtype=dtypes.float32),))
+    save.save(model, save_dir, traced_call)
+    self.assertAllClose({"output_0": [[8., 10.], [10., 12.]]},
+                        _import_and_infer(
+                            save_dir,
+                            {"inputs": [[4., 5.], [5., 6.]]}))
+
 
 class AssetTests(test.TestCase):
 
@@ -376,7 +401,7 @@ class _ModelWithOptimizerUsingDefun(util.Checkpoint):
 
   def __init__(self):
     self.dense = core.Dense(1)
-    self.optimizer = adam.AdamOptimizer(0.01)
+    self.optimizer = adam.Adam(0.01)
 
   # Using defun due to control flow v2 cycles, b/121159261. def_function uses
   # conds to gate variable initialization and so triggers cond reference cycles,
diff --git a/tensorflow/python/saved_model/saved_object_graph.proto b/tensorflow/python/saved_model/saved_object_graph.proto
index f46927d6e8734efdff028acb36983200b2a5bd1a..1e2514b7f7242105b6acd809c992d716df0973b6 100644
--- a/tensorflow/python/saved_model/saved_object_graph.proto
+++ b/tensorflow/python/saved_model/saved_object_graph.proto
@@ -104,6 +104,7 @@ message SavedMonomorphicFunction {
 message SavedVariable {
   DataType dtype = 1;
   TensorShapeProto shape = 2;
+  bool trainable = 3;
 
-  // TODO(andresp): Add "trainable" and save_slice_info_def.
+  // TODO(andresp): Add save_slice_info_def?
 }
diff --git a/tensorflow/python/saved_model/utils_impl.py b/tensorflow/python/saved_model/utils_impl.py
index 5caabe59fec1a0819629bd9ff16ad5be19f0890a..a82007fd545ca9e088411bcd5234477b8801e995 100644
--- a/tensorflow/python/saved_model/utils_impl.py
+++ b/tensorflow/python/saved_model/utils_impl.py
@@ -22,6 +22,7 @@ import os
 
 from tensorflow.core.framework import types_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -53,7 +54,17 @@ def build_tensor_info(tensor):
 
   Returns:
     A TensorInfo protocol buffer constructed based on the supplied argument.
+
+  Raises:
+    RuntimeError: If eager execution is enabled.
   """
+  if context.executing_eagerly():
+    raise RuntimeError("build_tensor_info is not supported in Eager mode.")
+  return build_tensor_info_internal(tensor)
+
+
+def build_tensor_info_internal(tensor):
+  """Utility function to build TensorInfo proto from a Tensor."""
   tensor_info = meta_graph_pb2.TensorInfo(
       dtype=dtypes.as_dtype(tensor.dtype).as_datatype_enum,
       tensor_shape=tensor.get_shape().as_proto())
diff --git a/tensorflow/python/saved_model/utils_test.py b/tensorflow/python/saved_model/utils_test.py
index 2afe8abfd646f26f0562d7cc56b82c5781a586ef..1e12de91b8652328632010d716f75f551aaab2db 100644
--- a/tensorflow/python/saved_model/utils_test.py
+++ b/tensorflow/python/saved_model/utils_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import types_pb2
+from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -81,6 +82,12 @@ class UtilsTest(test.TestCase):
     self.assertEqual(42, x_tensor_info.tensor_shape.dim[0].size)
     self.assertEqual(69, x_tensor_info.tensor_shape.dim[1].size)
 
+  def testBuildTensorInfoEager(self):
+    x = constant_op.constant(1, name="x")
+    with context.eager_mode(), self.assertRaisesRegexp(
+        RuntimeError, "build_tensor_info is not supported in Eager mode"):
+      utils.build_tensor_info(x)
+
   @test_util.run_v1_only("b/120545219")
   def testGetTensorFromInfoDense(self):
     expected = array_ops.placeholder(dtypes.float32, 1, name="x")
diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py
index 0c13016712f316e113723c4c0c250ef636a3fcf0..a01feb3dde041de2ca33f5f4d9fea6a1b6869d41 100644
--- a/tensorflow/python/summary/summary.py
+++ b/tensorflow/python/summary/summary.py
@@ -13,9 +13,10 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Tensor summaries for exporting information about a model.
+"""Operations for writing summary data, for use in analysis and visualization.
 
-See the [Summary](https://tensorflow.org/api_guides/python/summary) guide.
+See the [Summaries and
+TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard) guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/summary/summary_iterator.py b/tensorflow/python/summary/summary_iterator.py
index 321b11ffb73487405428340df94010ed8ddbfcd4..3675c235cfba1063bf2e338fd223dce6c540bec6 100644
--- a/tensorflow/python/summary/summary_iterator.py
+++ b/tensorflow/python/summary/summary_iterator.py
@@ -24,7 +24,7 @@ from tensorflow.python.lib.io import tf_record
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('train.summary_iterator')
+@tf_export(v1=['train.summary_iterator'])
 def summary_iterator(path):
   # pylint: disable=line-too-long
   """An iterator for reading `Event` protocol buffers from an event file.
diff --git a/tensorflow/python/summary/writer/writer.py b/tensorflow/python/summary/writer/writer.py
index 78217b503ffac90811c6ae8316bc0c0b907e7bf7..a66be4f833713d106deda15fef56f48ef4a321d3 100644
--- a/tensorflow/python/summary/writer/writer.py
+++ b/tensorflow/python/summary/writer/writer.py
@@ -279,7 +279,7 @@ class SummaryToEventTransformer(object):
     self.event_writer.add_event(event)
 
 
-@tf_export("summary.FileWriter")
+@tf_export(v1=["summary.FileWriter"])
 class FileWriter(SummaryToEventTransformer):
   """Writes `Summary` protocol buffers to event files.
 
diff --git a/tensorflow/python/summary/writer/writer_cache.py b/tensorflow/python/summary/writer/writer_cache.py
index 645fa28a37fb125b6b1224961251bc8879d5fe6d..c62a7ce1a3f6eb6cd223f70dabd478b2dba24394 100644
--- a/tensorflow/python/summary/writer/writer_cache.py
+++ b/tensorflow/python/summary/writer/writer_cache.py
@@ -25,7 +25,7 @@ from tensorflow.python.summary.writer.writer import FileWriter
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('summary.FileWriterCache')
+@tf_export(v1=['summary.FileWriterCache'])
 class FileWriterCache(object):
   """Cache for file writers.
 
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index 901d6bc335f3a10439e2f02d0db2b237a89fece0..f1a911eb489970cb6a594258e5fcf69e70f91fcd 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -38,7 +38,20 @@ py_library(
     name = "saved_model_utils",
     srcs = ["saved_model_utils.py"],
     srcs_version = "PY2AND3",
-    deps = ["//tensorflow/contrib/saved_model:reader"],
+)
+
+py_test(
+    name = "saved_model_utils_test",
+    size = "small",
+    srcs = ["saved_model_utils_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    visibility = ["//visibility:private"],
+    deps = [
+        ":saved_model_utils",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/saved_model",
+    ],
 )
 
 py_library(
@@ -250,7 +263,6 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         ":saved_model_utils",
-        "//tensorflow/contrib/saved_model:saved_model_py",
         "//tensorflow/python",
         "//tensorflow/python/debug:local_cli_wrapper",
     ],
diff --git a/tensorflow/python/tools/api/generator/doc_srcs.py b/tensorflow/python/tools/api/generator/doc_srcs.py
index b567eead3d0c8c3023322f95402662408152ce45..28bf0e9d015e6f4b28e8cfbf0dbb5a3ccec66f11 100644
--- a/tensorflow/python/tools/api/generator/doc_srcs.py
+++ b/tensorflow/python/tools/api/generator/doc_srcs.py
@@ -61,6 +61,7 @@ _TENSORFLOW_DOC_SOURCES = {
     'signal': DocSource(docstring_module_name='ops.signal.signal'),
     'sparse': DocSource(docstring_module_name='ops.sparse_ops'),
     'strings': DocSource(docstring_module_name='ops.string_ops'),
+    'summary': DocSource(docstring_module_name='summary.summary'),
     'sysconfig': DocSource(docstring_module_name='platform.sysconfig'),
     'test': DocSource(docstring_module_name='platform.test'),
     'train': DocSource(docstring_module_name='training.training'),
diff --git a/tensorflow/python/tools/api/generator/output_init_files_test.py b/tensorflow/python/tools/api/generator/output_init_files_test.py
index ab154af9101e32ecacda276004b0e2c39ced0b83..7013f007e583b7d35dcb6f8bfdbea2fefdbb3101 100644
--- a/tensorflow/python/tools/api/generator/output_init_files_test.py
+++ b/tensorflow/python/tools/api/generator/output_init_files_test.py
@@ -45,7 +45,7 @@ def _get_modules(package, attr_name, constants_attr_name):
       API constant names.
 
   Returns:
-    Set of TensorFow API modules.
+    Set of TensorFlow API modules.
   """
   modules = set()
   # TODO(annarev): split up the logic in create_python_api.py so that
diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py
index afc4e517cdd0a34171038cc0ae2d74ce30ecb6a9..cdef42e2bf8df4834677bb809194183332c6f279 100644
--- a/tensorflow/python/tools/saved_model_cli.py
+++ b/tensorflow/python/tools/saved_model_cli.py
@@ -30,9 +30,8 @@ import sys
 import warnings
 
 import numpy as np
-
 from six import integer_types
-from tensorflow.contrib.saved_model.python.saved_model import reader
+
 from tensorflow.core.example import example_pb2
 from tensorflow.core.framework import types_pb2
 from tensorflow.python.client import session
@@ -56,7 +55,7 @@ def _show_tag_sets(saved_model_dir):
   Args:
     saved_model_dir: Directory containing the SavedModel to inspect.
   """
-  tag_sets = reader.get_saved_model_tag_sets(saved_model_dir)
+  tag_sets = saved_model_utils.get_saved_model_tag_sets(saved_model_dir)
   print('The given SavedModel contains the following tag-sets:')
   for tag_set in sorted(tag_sets):
     print(', '.join(sorted(tag_set)))
@@ -190,7 +189,7 @@ def _show_all(saved_model_dir):
   Args:
     saved_model_dir: Directory containing the SavedModel to inspect.
   """
-  tag_sets = reader.get_saved_model_tag_sets(saved_model_dir)
+  tag_sets = saved_model_utils.get_saved_model_tag_sets(saved_model_dir)
   for tag_set in sorted(tag_sets):
     print("\nMetaGraphDef with tag-set: '%s' "
           "contains the following SignatureDefs:" % ', '.join(tag_set))
@@ -654,7 +653,7 @@ def scan(args):
     scan_meta_graph_def(
         saved_model_utils.get_meta_graph_def(args.dir, args.tag_set))
   else:
-    saved_model = reader.read_saved_model(args.dir)
+    saved_model = saved_model_utils.read_saved_model(args.dir)
     for meta_graph_def in saved_model.meta_graphs:
       scan_meta_graph_def(meta_graph_def)
 
diff --git a/tensorflow/python/tools/saved_model_utils.py b/tensorflow/python/tools/saved_model_utils.py
index c27d7a2658a096d1f5ce515dbc1f86423eb113de..17c4b8cb8319363a4a2d422a563ae1227d673366 100644
--- a/tensorflow/python/tools/saved_model_utils.py
+++ b/tensorflow/python/tools/saved_model_utils.py
@@ -18,7 +18,78 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.saved_model.python.saved_model import reader
+import os
+
+from google.protobuf import message
+from google.protobuf import text_format
+from tensorflow.core.protobuf import saved_model_pb2
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.saved_model import constants
+from tensorflow.python.util import compat
+
+
+def read_saved_model(saved_model_dir):
+  """Reads the savedmodel.pb or savedmodel.pbtxt file containing `SavedModel`.
+
+  Args:
+    saved_model_dir: Directory containing the SavedModel file.
+
+  Returns:
+    A `SavedModel` protocol buffer.
+
+  Raises:
+    IOError: If the file does not exist, or cannot be successfully parsed.
+  """
+  # Build the path to the SavedModel in pbtxt format.
+  path_to_pbtxt = os.path.join(
+      compat.as_bytes(saved_model_dir),
+      compat.as_bytes(constants.SAVED_MODEL_FILENAME_PBTXT))
+  # Build the path to the SavedModel in pb format.
+  path_to_pb = os.path.join(
+      compat.as_bytes(saved_model_dir),
+      compat.as_bytes(constants.SAVED_MODEL_FILENAME_PB))
+
+  # Ensure that the SavedModel exists at either path.
+  if not file_io.file_exists(path_to_pbtxt) and not file_io.file_exists(
+      path_to_pb):
+    raise IOError("SavedModel file does not exist at: %s" % saved_model_dir)
+
+  # Parse the SavedModel protocol buffer.
+  saved_model = saved_model_pb2.SavedModel()
+  if file_io.file_exists(path_to_pb):
+    try:
+      file_content = file_io.FileIO(path_to_pb, "rb").read()
+      saved_model.ParseFromString(file_content)
+      return saved_model
+    except message.DecodeError as e:
+      raise IOError("Cannot parse file %s: %s." % (path_to_pb, str(e)))
+  elif file_io.file_exists(path_to_pbtxt):
+    try:
+      file_content = file_io.FileIO(path_to_pbtxt, "rb").read()
+      text_format.Merge(file_content.decode("utf-8"), saved_model)
+      return saved_model
+    except text_format.ParseError as e:
+      raise IOError("Cannot parse file %s: %s." % (path_to_pbtxt, str(e)))
+  else:
+    raise IOError("SavedModel file does not exist at: %s/{%s|%s}" %
+                  (saved_model_dir, constants.SAVED_MODEL_FILENAME_PBTXT,
+                   constants.SAVED_MODEL_FILENAME_PB))
+
+
+def get_saved_model_tag_sets(saved_model_dir):
+  """Retrieves all the tag-sets available in the SavedModel.
+
+  Args:
+    saved_model_dir: Directory containing the SavedModel.
+
+  Returns:
+    String representation of all tag-sets in the SavedModel.
+  """
+  saved_model = read_saved_model(saved_model_dir)
+  all_tags = []
+  for meta_graph_def in saved_model.meta_graphs:
+    all_tags.append(list(meta_graph_def.meta_info_def.tags))
+  return all_tags
 
 
 def get_meta_graph_def(saved_model_dir, tag_set):
@@ -39,7 +110,7 @@ def get_meta_graph_def(saved_model_dir, tag_set):
   Returns:
     A MetaGraphDef corresponding to the tag-set.
   """
-  saved_model = reader.read_saved_model(saved_model_dir)
+  saved_model = read_saved_model(saved_model_dir)
   set_of_tags = set(tag_set.split(','))
   for meta_graph_def in saved_model.meta_graphs:
     if set(meta_graph_def.meta_info_def.tags) == set_of_tags:
diff --git a/tensorflow/python/tools/saved_model_utils_test.py b/tensorflow/python/tools/saved_model_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5512dea1f74c8a27045c0036fb0d6df9681169bf
--- /dev/null
+++ b/tensorflow/python/tools/saved_model_utils_test.py
@@ -0,0 +1,116 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for SavedModel utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import builder as saved_model_builder
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.tools import saved_model_utils
+
+
+def tearDownModule():
+  file_io.delete_recursively(test.get_temp_dir())
+
+
+class SavedModelUtilTest(test.TestCase):
+
+  def _init_and_validate_variable(self, sess, variable_name, variable_value):
+    v = variables.Variable(variable_value, name=variable_name)
+    sess.run(variables.global_variables_initializer())
+    self.assertEqual(variable_value, v.eval())
+
+  @test_util.deprecated_graph_mode_only
+  def testReadSavedModelValid(self):
+    saved_model_dir = os.path.join(test.get_temp_dir(), "valid_saved_model")
+    builder = saved_model_builder.SavedModelBuilder(saved_model_dir)
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 42)
+      builder.add_meta_graph_and_variables(sess, [tag_constants.TRAINING])
+    builder.save()
+
+    actual_saved_model_pb = saved_model_utils.read_saved_model(saved_model_dir)
+    self.assertEqual(len(actual_saved_model_pb.meta_graphs), 1)
+    self.assertEqual(
+        len(actual_saved_model_pb.meta_graphs[0].meta_info_def.tags), 1)
+    self.assertEqual(actual_saved_model_pb.meta_graphs[0].meta_info_def.tags[0],
+                     tag_constants.TRAINING)
+
+  def testReadSavedModelInvalid(self):
+    saved_model_dir = os.path.join(test.get_temp_dir(), "invalid_saved_model")
+    with self.assertRaisesRegexp(
+        IOError, "SavedModel file does not exist at: %s" % saved_model_dir):
+      saved_model_utils.read_saved_model(saved_model_dir)
+
+  @test_util.deprecated_graph_mode_only
+  def testGetSavedModelTagSets(self):
+    saved_model_dir = os.path.join(test.get_temp_dir(), "test_tags")
+    builder = saved_model_builder.SavedModelBuilder(saved_model_dir)
+
+    # Graph with a single variable. SavedModel invoked to:
+    # - add with weights.
+    # - a single tag (from predefined constants).
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 42)
+      builder.add_meta_graph_and_variables(sess, [tag_constants.TRAINING])
+
+    # Graph that updates the single variable. SavedModel invoked to:
+    # - simply add the model (weights are not updated).
+    # - a single tag (from predefined constants).
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 43)
+      builder.add_meta_graph([tag_constants.SERVING])
+
+    # Graph that updates the single variable. SavedModel is invoked:
+    # - to add the model (weights are not updated).
+    # - multiple predefined tags.
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 44)
+      builder.add_meta_graph([tag_constants.SERVING, tag_constants.GPU])
+
+    # Graph that updates the single variable. SavedModel is invoked:
+    # - to add the model (weights are not updated).
+    # - multiple predefined tags for serving on TPU.
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 44)
+      builder.add_meta_graph([tag_constants.SERVING, tag_constants.TPU])
+
+    # Graph that updates the single variable. SavedModel is invoked:
+    # - to add the model (weights are not updated).
+    # - multiple custom tags.
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 45)
+      builder.add_meta_graph(["foo", "bar"])
+
+    # Save the SavedModel to disk.
+    builder.save()
+
+    actual_tags = saved_model_utils.get_saved_model_tag_sets(saved_model_dir)
+    expected_tags = [["train"], ["serve"], ["serve", "gpu"], ["serve", "tpu"],
+                     ["foo", "bar"]]
+    self.assertEqual(expected_tags, actual_tags)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 0c701f47122caf7ae561ddfa84b98925226930e0..b80fb03111d8257b34ae8f4d795fb9fded96ed00 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -39,7 +39,7 @@ class AdamOptimizer(optimizer.Optimizer):
 
   def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
                use_locking=False, name="Adam"):
-    """Construct a new Adam optimizer.
+    r"""Construct a new Adam optimizer.
 
     Initialization:
 
@@ -75,23 +75,20 @@ class AdamOptimizer(optimizer.Optimizer):
 
     Args:
       learning_rate: A Tensor or a floating point value.  The learning rate.
-      beta1: A float value or a constant float tensor.
-        The exponential decay rate for the 1st moment estimates.
-      beta2: A float value or a constant float tensor.
-        The exponential decay rate for the 2nd moment estimates.
+      beta1: A float value or a constant float tensor. The exponential decay
+        rate for the 1st moment estimates.
+      beta2: A float value or a constant float tensor. The exponential decay
+        rate for the 2nd moment estimates.
       epsilon: A small constant for numerical stability. This epsilon is
         "epsilon hat" in the Kingma and Ba paper (in the formula just before
         Section 2.1), not the epsilon in Algorithm 1 of the paper.
       use_locking: If True use locks for update operations.
       name: Optional name for the operations created when applying gradients.
-        Defaults to "Adam".
-
-    @compatibility(eager)
-    When eager execution is enabled, `learning_rate`, `beta1`, `beta2`, and
-    `epsilon` can each be a callable that takes no arguments and returns the
-    actual value to use. This can be useful for changing these values across
-    different invocations of optimizer functions.
-    @end_compatibility
+        Defaults to "Adam".  @compatibility(eager) When eager execution is
+        enabled, `learning_rate`, `beta1`, `beta2`, and `epsilon` can each be a
+        callable that takes no arguments and returns the actual value to use.
+        This can be useful for changing these values across different
+        invocations of optimizer functions. @end_compatibility
     """
     super(AdamOptimizer, self).__init__(use_locking, name)
     self._lr = learning_rate
diff --git a/tensorflow/python/training/checkpoint_management.py b/tensorflow/python/training/checkpoint_management.py
index a7ad1f70e5e86d2fcd86b76c54314238edd400e1..21fa6b3b5d3f8c306f0116f4d21940164c28b104 100644
--- a/tensorflow/python/training/checkpoint_management.py
+++ b/tensorflow/python/training/checkpoint_management.py
@@ -621,7 +621,8 @@ class CheckpointManager(object):
                >= self._last_preserved_timestamp)):
         self._last_preserved_timestamp = timestamp
         continue
-      remove_checkpoint(filename)
+      _delete_file_if_exists(filename + ".index")
+      _delete_file_if_exists(filename + ".data-?????-of-?????")
 
   def _record_state(self):
     """Saves the `CheckpointManager`'s state in `directory`."""
diff --git a/tensorflow/python/training/checkpointable/BUILD b/tensorflow/python/training/checkpointable/BUILD
index 595ce2a0da0299137d174a9bfff690476816c7df..855dc4fb68048fd3f523b1b8d6bca8edce66e599 100644
--- a/tensorflow/python/training/checkpointable/BUILD
+++ b/tensorflow/python/training/checkpointable/BUILD
@@ -11,7 +11,7 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_test")
 
 py_library(
@@ -32,11 +32,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "base_test",
     srcs = ["base_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":base",
         "//tensorflow/python:client_testlib",
     ],
@@ -52,11 +51,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tracking_test",
     srcs = ["tracking_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":base",
         ":tracking",
         "//tensorflow/python:client_testlib",
@@ -79,11 +77,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "data_structures_test",
     srcs = ["data_structures_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":data_structures",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_test_lib",
@@ -129,15 +126,15 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "util_test",
     srcs = ["util_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],  # b/74395663
-    deps = [
+    additional_deps = [
         ":base",
         ":tracking",
         ":util",
+        "@absl_py//absl/testing:parameterized",
+        "@six_archive//:six",
         "//tensorflow/python:checkpoint_management",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
@@ -160,9 +157,8 @@ py_test(
         "//tensorflow/python/eager:test",
         "//tensorflow/python/keras:engine",
         "//tensorflow/python/keras:layers",
-        "@absl_py//absl/testing:parameterized",
-        "@six_archive//:six",
     ],
+    tags = ["notsan"],  # b/74395663
 )
 
 tf_xla_py_test(
@@ -188,15 +184,15 @@ tf_xla_py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "util_with_v1_optimizers_test",
     srcs = ["util_with_v1_optimizers_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],  # b/74395663
-    deps = [
+    additional_deps = [
         ":base",
         ":tracking",
         ":util",
+        "@absl_py//absl/testing:parameterized",
+        "@six_archive//:six",
         "//tensorflow/python:checkpoint_management",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
@@ -220,7 +216,6 @@ py_test(
         "//tensorflow/python/eager:test",
         "//tensorflow/python/keras:engine",
         "//tensorflow/python/keras:layers",
-        "@absl_py//absl/testing:parameterized",
-        "@six_archive//:six",
     ],
+    tags = ["notsan"],  # b/74395663
 )
diff --git a/tensorflow/python/training/checkpointable/util.py b/tensorflow/python/training/checkpointable/util.py
index c890a7f44084481d78e72bfde575d176f2ad8039..a45263f5c6b7b514703a38910c2c8aadc7be6b11 100644
--- a/tensorflow/python/training/checkpointable/util.py
+++ b/tensorflow/python/training/checkpointable/util.py
@@ -176,25 +176,13 @@ class _CheckpointRestoreCoordinator(object):
         raise AssertionError(
             ("Saveable keys changed when validating. Got back %s, was "
              "expecting %s") % (tensor_saveables.keys(), validated_names))
-      for saveable in validated_saveables:
-        if saveable.device:
-          device = saveable_object_util.set_cpu0(saveable.device)
-        else:
-          device = None
-        with ops.device(device):
-          tensors = []
-          for spec in saveable.specs:
-            tensors.append(
-                io_ops.restore_v2(
-                    self.save_path_tensor,
-                    [spec.name],
-                    [spec.slice_spec],
-                    [spec.dtype])[0])
-          restore_op = saveable.restore(tensors, restored_shapes=None)
-        if not context.executing_eagerly():
+      new_restore_ops = functional_saver.restore_from_saveable_objects(
+          self.save_path_tensor, validated_saveables)
+      if not context.executing_eagerly():
+        restore_ops.extend(new_restore_ops)
+        for saveable, restore_op in zip(validated_saveables, new_restore_ops):
           assert saveable.name not in self.restore_ops_by_name
           self.restore_ops_by_name[saveable.name] = restore_op
-          restore_ops.append(restore_op)
     return restore_ops
 
 
diff --git a/tensorflow/python/training/checkpointable/util_test.py b/tensorflow/python/training/checkpointable/util_test.py
index a5f4fec672ba95179a9afe8ed5cfac2311c3d265..4987387dc302368a9c3556b5a50a78b321d51812 100644
--- a/tensorflow/python/training/checkpointable/util_test.py
+++ b/tensorflow/python/training/checkpointable/util_test.py
@@ -1242,7 +1242,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     # Make sure initialization doesn't clobber later restores
     with test_util.device(use_gpu=True):
       model = MyModel()
-      optimizer = adam.Adam(0.001, beta1=1.0)
+      optimizer = adam.Adam(0.001, beta_1=1.0)
       root = checkpointable_utils.Checkpoint(
           optimizer=optimizer, model=model)
       opt_root = checkpointable_utils.Checkpoint(
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index 72670f0ca39f67b151abcb1813ede7ee36c6544b..4b267dfb988b3c9c84d15d0074f1da10b33ef90d 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -505,13 +505,13 @@ class ExponentialMovingAverage(object):
     ```
     Args:
       moving_avg_variables: a list of variables that require to use of the
-        moving variable name to be restored. If None, it will default to
+        moving average variable name to be restored. If None, it will default to
         variables.moving_average_variables() + variables.trainable_variables()
 
     Returns:
-      A map from restore_names to variables. The restore_name can be the
-      moving_average version of the variable name if it exist, or the original
-      variable name.
+      A map from restore_names to variables. The restore_name is either the
+      original or the moving average version of the variable name, depending
+      on whether the variable name is in the `moving_avg_variables`.
     """
     name_map = {}
     if moving_avg_variables is None:
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index eaa563e84aa76f6c27ed497c4e7c5db51cdb3fda..8076ed31bfcb4063b0e7417b82b7067b7cca41a0 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -521,8 +521,7 @@ class Optimizer(
   @staticmethod
   def _scale_loss(loss_value):
     if distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN:
-      num_replicas = \
-        distribute_ctx.get_distribution_strategy().num_replicas_in_sync
+      num_replicas = distribute_ctx.get_strategy().num_replicas_in_sync
       if num_replicas > 1:
         loss_value *= (1. / num_replicas)
     return loss_value
@@ -554,14 +553,15 @@ class Optimizer(
     # by most optimizers.  It relies on the subclass implementing the following
     # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse().
 
-    # Handle DistributionStrategy case.
-    if distribute_ctx.get_cross_replica_context():
-      raise RuntimeError("Use `_distributed_apply()` instead of "
-                         "`apply_gradients()` in a cross-replica context.")
-    # TODO(isaprykin): Get rid of `has_distribution_strategy()` check by
+    # TODO(isaprykin): Get rid of `has_strategy()` check by
     # always calling _distributed_apply(), using the default distribution
     # as needed.
-    if distribute_ctx.has_distribution_strategy():
+    if distribute_ctx.has_strategy():
+      # Handle DistributionStrategy case.
+      if distribute_ctx.in_cross_replica_context():
+        raise RuntimeError("Use `_distributed_apply()` instead of "
+                           "`apply_gradients()` in a cross-replica context.")
+
       grads_and_vars = get_filtered_grad_fn(lambda: grads_and_vars)()
       return distribute_ctx.get_replica_context().merge_call(
           self._distributed_apply, args=(grads_and_vars, global_step, name))
@@ -815,7 +815,7 @@ class Optimizer(
     v = self._non_slot_dict.get(key, None)
     if v is None:
       self._maybe_initialize_checkpointable()
-      distribution_strategy = distribute_ctx.get_distribution_strategy()
+      distribution_strategy = distribute_ctx.get_strategy()
       with distribution_strategy.colocate_vars_with(colocate_with):
         if eager:
           restored_initial_value = self._preload_simple_restoration(
diff --git a/tensorflow/python/training/saving/functional_saver.py b/tensorflow/python/training/saving/functional_saver.py
index 51f618ddd32ce3a395829ac0b2eb36132b3d9bae..4ff2742c2f1b8b68528914c5c23414b1f87c957b 100644
--- a/tensorflow/python/training/saving/functional_saver.py
+++ b/tensorflow/python/training/saving/functional_saver.py
@@ -107,25 +107,32 @@ class Saver(object):
       A scalar string Tensor containing `file_prefix` with control dependencies
       on the restore ops.
     """
-    restore_specs = []
-    tensor_structure = []
-    for saveable in self._saveable_objects:
-      saveable_tensor_structure = []
-      tensor_structure.append(saveable_tensor_structure)
-      for spec in saveable.specs:
-        saveable_tensor_structure.append(spec.name)
-        restore_specs.append((spec.name, spec.slice_spec, spec.dtype))
-    tensor_names, tensor_slices, tensor_dtypes = zip(*restore_specs)
-    with ops.device("cpu:0"):
-      restored_tensors = io_ops.restore_v2(
-          file_prefix, tensor_names, tensor_slices, tensor_dtypes)
-    structured_restored_tensors = nest.pack_sequence_as(
-        tensor_structure, restored_tensors)
-    restore_ops = []
-    for saveable, restored_tensors in zip(self._saveable_objects,
-                                          structured_restored_tensors):
-      restore_ops.append(saveable.restore(restored_tensors,
-                                          restored_shapes=None))
+    restore_ops = restore_from_saveable_objects(
+        file_prefix, self._saveable_objects)
     with ops.device("cpu:0"):
       with ops.control_dependencies(restore_ops):
         return array_ops.identity(file_prefix)
+
+
+def restore_from_saveable_objects(file_prefix, saveable_objects):
+  """Reads from a checkpoint and returns restore ops for `saveable_objects`s."""
+  restore_specs = []
+  tensor_structure = []
+  for saveable in saveable_objects:
+    saveable_tensor_structure = []
+    tensor_structure.append(saveable_tensor_structure)
+    for spec in saveable.specs:
+      saveable_tensor_structure.append(spec.name)
+      restore_specs.append((spec.name, spec.slice_spec, spec.dtype))
+  tensor_names, tensor_slices, tensor_dtypes = zip(*restore_specs)
+  with ops.device("cpu:0"):
+    restored_tensors = io_ops.restore_v2(
+        file_prefix, tensor_names, tensor_slices, tensor_dtypes)
+  structured_restored_tensors = nest.pack_sequence_as(
+      tensor_structure, restored_tensors)
+  restore_ops = []
+  for saveable, restored_tensors in zip(saveable_objects,
+                                        structured_restored_tensors):
+    restore_ops.append(saveable.restore(restored_tensors,
+                                        restored_shapes=None))
+  return restore_ops
diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py
index 0f68fcfe8bb4cb81e54ba27d35bfb0b2e3888a1b..7bd0891e35dac1f04a5e98012253b39cca08478f 100644
--- a/tensorflow/python/training/session_manager.py
+++ b/tensorflow/python/training/session_manager.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -181,8 +182,16 @@ class SessionManager(object):
         set.
     """
     self._target = master
-    sess = session.Session(self._target, graph=self._graph, config=config)
 
+    # This is required to so that we initialize the TPU device before
+    # restoring from checkpoint since we'll be placing variables on the device
+    # and TPUInitialize wipes out the memory of the device.
+    strategy = distribution_strategy_context.get_strategy()
+    if strategy and hasattr(strategy.extended,
+                            "_experimental_initialize_system"):
+      strategy.extended._experimental_initialize_system()  # pylint: disable=protected-access
+
+    sess = session.Session(self._target, graph=self._graph, config=config)
     if checkpoint_dir and checkpoint_filename_with_path:
       raise ValueError("Can not provide both checkpoint_dir and "
                        "checkpoint_filename_with_path.")
diff --git a/tensorflow/python/training/slot_creator.py b/tensorflow/python/training/slot_creator.py
index bc1137e200dc0bfbc49c518dff63121ae3cd4f9e..abe1253b00dd78f63ce09c08ebc25dc717166410 100644
--- a/tensorflow/python/training/slot_creator.py
+++ b/tensorflow/python/training/slot_creator.py
@@ -121,8 +121,7 @@ def create_slot(primary, val, name, colocate_with_primary=True):
     prefix = primary.op.name
   with variable_scope.variable_scope(None, prefix + "/" + name):
     if colocate_with_primary:
-      distribution_strategy = (
-          distribution_strategy_context.get_distribution_strategy())
+      distribution_strategy = distribution_strategy_context.get_strategy()
       with distribution_strategy.colocate_vars_with(primary):
         return _create_slot_var(primary, val, "", validate_shape, None, None)
     else:
@@ -159,8 +158,7 @@ def create_slot_with_initializer(primary, initializer, shape, dtype, name,
     prefix = primary.op.name
   with variable_scope.variable_scope(None, prefix + "/" + name):
     if colocate_with_primary:
-      distribution_strategy = (
-          distribution_strategy_context.get_distribution_strategy())
+      distribution_strategy = distribution_strategy_context.get_strategy()
       with distribution_strategy.colocate_vars_with(primary):
         return _create_slot_var(primary, initializer, "", validate_shape, shape,
                                 dtype)
diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py
index cd4590db7f6550f8790ad683c9aaecf145ad12da..0079ecc98b0da3db38d1284258c7099f071cec9f 100644
--- a/tensorflow/python/training/sync_replicas_optimizer.py
+++ b/tensorflow/python/training/sync_replicas_optimizer.py
@@ -260,8 +260,7 @@ class SyncReplicasOptimizer(optimizer.Optimizer):
     # local_anchor op will be placed on this worker task by default.
     local_anchor = control_flow_ops.no_op()
     # Colocating local_step variable prevents it being placed on the PS.
-    distribution_strategy = (
-        distribution_strategy_context.get_distribution_strategy())
+    distribution_strategy = distribution_strategy_context.get_strategy()
     with distribution_strategy.colocate_vars_with(local_anchor):
       self._local_step = variable_scope.variable(
           initial_value=0,
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index c43efc799ca5867c4f5b86023a94a4ef693d337b..7cf7e75a550513e4243d245236361e3a71a6639a 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -1,126 +1,663 @@
-licenses(["restricted"])
+# GPU executor library for data-parallel kernel launches and cross-platform
+# HPC-library APIs.
+#
+# Throughout this file, all targets are built with the standard crosstool and
+# do not link against restricted binary blobs.
 
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
-load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
-load("//tensorflow:tensorflow.bzl", "cc_header_only_library")
+load("//tensorflow/stream_executor:build_defs.bzl", "stream_executor_friends")
+
+package_group(
+    name = "friends",
+    packages = stream_executor_friends(),
+)
+
+package(
+    default_visibility = [":friends"],
+)
+
+# Filegroup used to collect source files for the dependency check.
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+cc_library(
+    name = "launch_dim",
+    hdrs = [
+        "gpu_launch_dim.h",
+        "launch_dim.h",
+    ],
+    deps = [
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "device_description",
+    srcs = ["device_description.cc"],
+    hdrs = ["device_description.h"],
+    deps = [
+        ":launch_dim",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "event",
+    srcs = [
+        "blas.h",
+        "device_description.h",
+        "device_options.h",
+        "dnn.h",
+        "event.cc",
+        "fft.h",
+        "kernel_cache_config.h",
+        "launch_dim.h",
+        "plugin.h",
+        "plugin_registry.h",
+        "rng.h",
+        "shared_memory_config.h",
+        "stream_executor_pimpl.h",
+        "temporary_device_memory.h",
+        "temporary_memory_manager.h",
+        "trace_listener.h",
+    ],
+    hdrs = [
+        "device_memory.h",
+        "event.h",
+        "kernel.h",
+        "kernel_spec.h",
+        "platform.h",
+        "stream.h",
+        "stream_executor_internal.h",
+    ],
+    deps = [
+        ":dnn_proto_cc",
+        ":host_or_device_scalar",
+        ":stream_executor_headers",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "kernel",
+    srcs = [
+        "dnn.h",
+        "fft.h",
+        "kernel.cc",
+        "plugin.h",
+        "rng.h",
+        "stream.h",
+        "stream_executor_pimpl.h",
+        "temporary_device_memory.h",
+        "temporary_memory_manager.h",
+    ],
+    hdrs = [
+        "blas.h",
+        "device_description.h",
+        "device_options.h",
+        "event.h",
+        "kernel.h",
+        "kernel_spec.h",
+        "launch_dim.h",
+        "multi_platform_manager.h",
+        "platform.h",
+        "plugin_registry.h",
+        "shared_memory_config.h",
+        "stream_executor.h",
+        "stream_executor_internal.h",
+        "timer.h",
+        "trace_listener.h",
+    ],
+    deps = [
+        ":device_memory",
+        ":dnn_proto_cc",
+        ":host_or_device_scalar",
+        ":kernel_cache_config",
+        ":stream_executor_headers",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
 
-STREAM_EXECUTOR_HEADERS = glob([
-    "*.h",
-    "cuda/*.h",
-    "host/*.h",
-    "lib/*.h",
-    "lib/gtl/*.h",
-    "platform/**/*.h",
-])
+cc_library(
+    name = "kernel_spec",
+    srcs = ["kernel_spec.cc"],
+    hdrs = ["kernel_spec.h"],
+    deps = [
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "kernel_cache_config",
+    hdrs = ["kernel_cache_config.h"],
+)
+
+cc_library(
+    name = "module_spec",
+    hdrs = ["module_spec.h"],
+    deps = [
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "shared_memory_config",
+    hdrs = ["shared_memory_config.h"],
+)
+
+cc_library(
+    name = "stream_header",
+    hdrs = [
+        "blas.h",
+        "device_memory.h",
+        "dnn.h",
+        "event.h",
+        "fft.h",
+        "gpu_launch_dim.h",
+        "kernel.h",
+        "kernel_cache_config.h",
+        "launch_dim.h",
+        "stream.h",
+        "temporary_device_memory.h",
+        "temporary_memory_manager.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":dnn_proto_cc",
+        ":host_or_device_scalar",
+        ":stream_executor_headers",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+# It implements :stream_header
+cc_library(
+    name = "stream",
+    srcs = [
+        "host_buffer.h",
+        "stream.cc",
+    ],
+    hdrs = ["stream.h"],
+    deps = [
+        ":blas",
+        ":device_memory",
+        ":dnn",
+        ":event",
+        ":fft",
+        ":host_or_device_scalar",
+        ":kernel",
+        ":launch_dim",
+        ":platform",
+        ":rng",
+        ":stream_executor_headers",
+        ":stream_executor_internal",
+        ":stream_executor_pimpl",
+        ":temporary_memory_manager",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//third_party/eigen3",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "timer",
+    srcs = [
+        "device_description.h",
+        "kernel_cache_config.h",
+        "timer.cc",
+    ],
+    hdrs = [
+        "blas.h",
+        "kernel.h",
+        "stream.h",
+        "stream_executor.h",
+        "timer.h",
+    ],
+    deps = [
+        ":host_or_device_scalar",
+        ":platform",
+        ":stream_executor_headers",
+        ":stream_executor_internal",
+        ":stream_executor_pimpl_header",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "platform",
+    srcs = ["platform.cc"],
+    hdrs = ["platform.h"],
+    deps = [
+        ":plugin",
+        ":stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "rng",
+    srcs = ["rng.cc"],
+    hdrs = ["rng.h"],
+    deps = ["//tensorflow/stream_executor/platform"],
+)
+
+cc_library(
+    name = "temporary_device_memory",
+    srcs = [
+        "event.h",
+        "temporary_device_memory.cc",
+        "temporary_memory_manager.h",
+    ],
+    hdrs = ["temporary_device_memory.h"],
+    deps = [
+        ":device_memory",
+        ":stream_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "temporary_memory_manager",
+    srcs = ["temporary_memory_manager.cc"],
+    hdrs = ["temporary_memory_manager.h"],
+    deps = [
+        ":device_memory",
+        ":stream_executor_pimpl_header",
+        ":stream_header",
+        ":temporary_device_memory",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "fft",
+    hdrs = ["fft.h"],
+    deps = [
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "blas",
+    srcs = ["blas.cc"],
+    hdrs = ["blas.h"],
+    deps = [
+        ":host_or_device_scalar",
+        ":stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "device_memory",
+    hdrs = ["device_memory.h"],
+    deps = ["//tensorflow/stream_executor/platform"],
+)
+
+cc_library(
+    name = "host_or_device_scalar",
+    hdrs = ["host_or_device_scalar.h"],
+    deps = [
+        ":device_memory",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "device_options",
+    hdrs = ["device_options.h"],
+    deps = [
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "executor_cache",
+    srcs = [
+        "device_description.h",
+        "device_memory.h",
+        "device_options.h",
+        "event.h",
+        "executor_cache.cc",
+        "launch_dim.h",
+        "plugin.h",
+        "plugin_registry.h",
+        "rng.h",
+        "stream_executor_pimpl.h",
+        "temporary_device_memory.h",
+        "temporary_memory_manager.h",
+    ],
+    hdrs = [
+        "blas.h",
+        "dnn.h",
+        "executor_cache.h",
+        "fft.h",
+        "kernel.h",
+        "kernel_cache_config.h",
+        "kernel_spec.h",
+        "platform.h",
+        "shared_memory_config.h",
+        "stream.h",
+        "stream_executor_internal.h",
+        "trace_listener.h",
+    ],
+    deps = [
+        ":dnn_proto_cc",
+        ":host_or_device_scalar",
+        ":stream_executor_headers",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "multi_platform_manager",
+    srcs = ["multi_platform_manager.cc"],
+    hdrs = ["multi_platform_manager.h"],
+    deps = [
+        ":platform",
+        ":stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+    ],
+)
+
+cc_library(
+    name = "plugin",
+    srcs = ["plugin.cc"],
+    hdrs = ["plugin.h"],
+)
+
+cc_library(
+    name = "plugin_registry",
+    srcs = ["plugin_registry.cc"],
+    hdrs = ["plugin_registry.h"],
+    deps = [
+        ":blas",
+        ":dnn",
+        ":fft",
+        ":multi_platform_manager",
+        ":platform",
+        ":plugin",
+        ":stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+    ],
+)
+
+cc_library(
+    name = "scratch_allocator",
+    srcs = ["scratch_allocator.cc"],
+    hdrs = ["scratch_allocator.h"],
+    deps = [
+        ":device_memory",
+        ":stream_header",
+        ":temporary_device_memory",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "host_buffer",
+    hdrs = ["host_buffer.h"],
+    deps = [":dnn"],
+)
 
 tf_proto_library(
     name = "dnn_proto",
     srcs = ["dnn.proto"],
     cc_api_version = 2,
     default_header = True,
-    protodeps = tf_additional_all_protos(),
+    provide_cc_alias = True,
 )
 
 tf_proto_library(
     name = "logging_proto",
     srcs = ["logging.proto"],
     cc_api_version = 2,
-    default_header = True,
-    protodeps = tf_additional_all_protos(),
+    protodeps = [":dnn_proto"],
+    provide_cc_alias = True,
+    visibility = [":friends"],
 )
 
 cc_library(
-    name = "stream_executor_impl",
-    srcs = glob(
-        [
-            "*.cc",
-            "host/*.cc",
-            "cuda/cuda_platform_id.cc",
-            "lib/*.cc",
-            "platform/default/*.cc",
-        ],
-        exclude = [
-            "**/*_test.cc",
-        ],
-    ),
-    hdrs = STREAM_EXECUTOR_HEADERS,
-    linkopts = select({
-        "//tensorflow:freebsd": [],
-        "//tensorflow:windows": [],
-        "//conditions:default": ["-ldl"],
-    }),
-    visibility = ["//visibility:public"],
+    name = "dnn",
+    srcs = ["dnn.cc"],
+    hdrs = ["dnn.h"],
     deps = [
-        ":dnn_proto_cc_impl",
-        ":logging_proto_cc_impl",
+        ":device_memory",
+        ":dnn_proto_cc",
+        ":stream_executor_headers",
         "//tensorflow/core:lib",
-        "//tensorflow/core:logger",
-        "//tensorflow/core:ptr_util",
-        "@com_google_absl//absl/container:flat_hash_map",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
         "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/synchronization",
-        "@local_config_cuda//cuda:cuda_headers",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
     ],
-    alwayslink = 1,
 )
 
 cc_library(
-    name = "stream_executor",
-    hdrs = STREAM_EXECUTOR_HEADERS,
+    name = "stream_executor_internal",
+    srcs = [
+        "dnn.h",
+        "stream_executor_internal.cc",
+    ],
+    hdrs = [
+        "shared_memory_config.h",
+        "stream_executor_internal.h",
+    ],
+    deps = [
+        ":device_description",
+        ":device_memory",
+        ":device_options",
+        ":dnn_proto_cc",
+        ":kernel",
+        ":kernel_cache_config",
+        ":kernel_spec",
+        ":launch_dim",
+        ":plugin_registry",
+        ":stream_executor_headers",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "stream_executor_pimpl_header",
+    hdrs = [
+        "device_description.h",
+        "dnn.h",
+        "kernel.h",
+        "kernel_cache_config.h",
+        "shared_memory_config.h",
+        "stream_executor_pimpl.h",
+    ],
     visibility = ["//visibility:public"],
     deps = [
         ":dnn_proto_cc",
-        ":logging_proto_cc",
+        ":platform",
+        ":stream_executor_headers",
+        ":stream_executor_internal",
         "//tensorflow/core:lib",
-        "//tensorflow/core:ptr_util",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
-        "@local_config_cuda//cuda:cuda_headers",
-    ] + if_static([":stream_executor_impl"]),
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
 )
 
-cc_header_only_library(
-    name = "stream_executor_headers_lib",
-    visibility = ["//visibility:public"],
+# It implements :stream_executor_pimpl_header
+cc_library(
+    name = "stream_executor_pimpl",
+    srcs = ["stream_executor_pimpl.cc"],
+    hdrs = ["stream_executor_pimpl.h"],
     deps = [
-        ":stream_executor",
+        ":blas",
+        ":executor_cache",
+        ":fft",
+        ":kernel",
+        ":platform",
+        ":rng",
+        ":stream_executor_headers",
+        ":stream_header",
+        ":timer",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
     ],
 )
 
+# The stream_executor_headers target does not prescribe an implementation.
+#
+# TODO(b/25131218) this is OBSOLETE/DEPRECATED -- get rid of this target altogether
 cc_library(
-    name = "cuda_platform",
-    srcs = if_cuda_is_configured(
-        glob(
-            [
-                "cuda/*.cc",
-            ],
-            exclude = [
-                "cuda/*_test.cc",
-                "cuda/cuda_platform_id.cc",
-            ],
-        ),
-    ),
-    copts = select({
-        "//tensorflow:windows": ["/DNOGDI"],
-        "//conditions:default": [],
-    }),
-    linkopts = select({
-        "//tensorflow:freebsd": [],
-        "//tensorflow:windows": [],
-        "//conditions:default": ["-ldl"],
-    }),
+    name = "stream_executor_headers",
+    hdrs = [
+        "blas.h",
+        "device_description.h",
+        "device_memory.h",
+        "device_options.h",
+        "dnn.h",
+        "event.h",
+        "executor_cache.h",
+        "fft.h",
+        "gpu_launch_dim.h",
+        "kernel.h",
+        "kernel_cache_config.h",
+        "kernel_spec.h",
+        "launch_dim.h",
+        "module_spec.h",
+        "multi_platform_manager.h",
+        "platform.h",
+        "plugin.h",
+        "plugin_registry.h",
+        "rng.h",
+        "shared_memory_config.h",
+        "stream.h",
+        "stream_executor.h",
+        "stream_executor_internal.h",
+        "stream_executor_pimpl.h",
+        "temporary_device_memory.h",
+        "temporary_memory_manager.h",
+        "timer.h",
+        "trace_listener.h",
+    ],
     visibility = ["//visibility:public"],
     deps = [
-        ":stream_executor",
+        ":dnn_proto_cc",
+        ":host_or_device_scalar",
         "//tensorflow/core:lib",
-        "//tensorflow/core/kernels:ops_util",
-        "@local_config_cuda//cuda:cuda_headers",
-    ] + if_cuda_is_configured([
-        "//tensorflow/core:cuda",
-        "@local_config_cuda//cuda:cuda_driver",
-        "@local_config_cuda//cuda:cudnn",
-    ]),
-    alwayslink = 1,
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "stream_executor",
+    hdrs = ["stream_executor.h"],
+    deps = [":stream_executor_headers"] + if_static([":stream_executor_impl"]),
+)
+
+cc_library(
+    name = "stream_executor_impl",
+    deps = [
+        ":device_description",
+        ":device_memory",
+        ":dnn_proto_cc",
+        ":dnn_proto_cc_impl",
+        ":event",
+        ":kernel",
+        ":launch_dim",
+        ":multi_platform_manager",
+        ":platform",
+        ":stream",
+        ":stream_executor_headers",
+        ":stream_executor_pimpl",
+        ":timer",
+    ],
+)
+
+tf_cc_test(
+    name = "stream_test",
+    size = "small",
+    srcs = ["stream_test.cc"],
+    deps = [
+        ":stream_executor",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/stream_executor/host:host_platform",
+    ],
+)
+
+alias(
+    name = "cuda_platform",
+    actual = "//tensorflow/stream_executor/cuda:all_runtime",
 )
diff --git a/tensorflow/stream_executor/build_defs.bzl b/tensorflow/stream_executor/build_defs.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..a7ddf5a0d7ab9849019339532c810ed19b46ca74
--- /dev/null
+++ b/tensorflow/stream_executor/build_defs.bzl
@@ -0,0 +1,11 @@
+def stream_executor_friends():
+    return ["//tensorflow/..."]
+
+def tf_additional_cuda_platform_deps():
+  return []
+
+def tf_additional_cuda_driver_deps():
+  return ["@local_config_cuda//cuda:cuda_driver"]
+
+def tf_additional_cudnn_plugin_deps():
+  return []
diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..87c8eae4164e8faa3538b75166f455218f78e010
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/BUILD
@@ -0,0 +1,353 @@
+# Description:
+#   CUDA-platform specific StreamExecutor support code.
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load(
+    "//tensorflow/stream_executor:build_defs.bzl",
+    "stream_executor_friends",
+    "tf_additional_cuda_driver_deps",
+    "tf_additional_cuda_platform_deps",
+    "tf_additional_cudnn_plugin_deps",
+)
+load("//tensorflow:tensorflow.bzl", "tf_copts")
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
+package_group(
+    name = "friends",
+    packages = stream_executor_friends(),
+)
+
+package(
+    default_visibility = [":friends"],
+)
+
+# Filegroup used to collect source files for the dependency check.
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+cc_library(
+    name = "cuda_platform_id",
+    srcs = ["cuda_platform_id.cc"],
+    hdrs = ["cuda_platform_id.h"],
+    deps = ["//tensorflow/stream_executor:platform"],
+)
+
+cc_library(
+    name = "cuda_platform",
+    srcs = ["cuda_platform.cc"],
+    hdrs = ["cuda_platform.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cuda_driver",
+        ":cuda_gpu_executor",
+        ":cuda_platform_id",
+        "//tensorflow/stream_executor",  # buildcleaner: keep
+        "//tensorflow/stream_executor:executor_cache",
+        "//tensorflow/stream_executor:multi_platform_manager",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ] + tf_additional_cuda_platform_deps(),
+    alwayslink = True,  # Registers itself with the MultiPlatformManager.
+)
+
+cc_library(
+    name = "cuda_diagnostics",
+    srcs = ["cuda_diagnostics.cc"],
+    hdrs = ["cuda_diagnostics.h"],
+    deps = [
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "cuda_driver",
+    srcs = ["cuda_driver.cc"],
+    hdrs = ["cuda_driver.h"],
+    deps = [
+        ":cuda_diagnostics",
+        "@com_google_absl//absl/base",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/stream_executor:device_options",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ] + tf_additional_cuda_driver_deps(),
+)
+
+# The activation library is tightly coupled to the executor library.
+# TODO(leary) split up cuda_gpu_executor.cc so that this can stand alone.
+cc_library(
+    name = "cuda_activation_header",
+    hdrs = ["cuda_activation.h"],
+    visibility = ["//visibility:public"],
+    deps = ["//tensorflow/stream_executor/platform"],
+)
+
+cc_library(
+    name = "cuda_activation",
+    srcs = ["cuda_activation.cc"],
+    hdrs = ["cuda_activation.h"],
+    deps = [
+        ":cuda_driver",
+        "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor/platform",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
+
+cc_library(
+    name = "cuda_gpu_executor_header",
+    textual_hdrs = ["cuda_gpu_executor.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cuda_kernel",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "cublas_plugin",
+    srcs = ["cuda_blas.cc"],
+    hdrs = ["cuda_blas.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cuda_activation",
+        ":cuda_gpu_executor",
+        ":cuda_helpers",
+        ":cuda_platform_id",
+        ":cuda_stream",
+        ":cuda_timer",
+        "@com_google_absl//absl/strings",
+        "//third_party/eigen3",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:host_or_device_scalar",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor:timer",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ] + if_static(["@local_config_cuda//cuda:cublas"]),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "cufft_plugin",
+    srcs = ["cuda_fft.cc"],
+    hdrs = ["cuda_fft.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cuda_activation_header",
+        ":cuda_gpu_executor_header",
+        ":cuda_helpers",
+        ":cuda_platform_id",
+        ":cuda_stream",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:fft",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ] + if_static(["@local_config_cuda//cuda:cufft"]),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "cudnn_plugin",
+    srcs = ["cuda_dnn.cc"],
+    hdrs = ["cuda_dnn.h"],
+    copts = [
+        # STREAM_EXECUTOR_CUDNN_WRAP would fail on Clang with the default
+        # setting of template depth 256
+        "-ftemplate-depth-512",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cuda_activation",
+        ":cuda_diagnostics",
+        ":cuda_driver",
+        ":cuda_gpu_executor",
+        ":cuda_platform_id",
+        ":cuda_stream",
+        ":cuda_timer",
+        ":cudnn_version",
+        "@com_google_absl//absl/strings",
+        "//third_party/eigen3",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:logger",
+        "//tensorflow/stream_executor:dnn",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:logging_proto_cc",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor:temporary_device_memory",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ] + tf_additional_cudnn_plugin_deps() + if_static(["@local_config_cuda//cuda:cudnn"]),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "curand_plugin",
+    srcs = ["cuda_rng.cc"],
+    hdrs = ["cuda_rng.h"],
+    deps = [
+        ":cuda_activation",
+        ":cuda_gpu_executor",
+        ":cuda_helpers",
+        ":cuda_platform_id",
+        ":cuda_stream",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:rng",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ] + if_static(["@local_config_cuda//cuda:curand"]),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "cuda_kernel",
+    hdrs = ["cuda_kernel.h"],
+    deps = [
+        ":cuda_driver",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
+
+# TODO(leary) we likely need to canonicalize/eliminate this.
+cc_library(
+    name = "cuda_helpers",
+    textual_hdrs = ["cuda_helpers.h"],
+)
+
+cc_library(
+    name = "cuda_event",
+    srcs = ["cuda_event.cc"],
+    hdrs = ["cuda_event.h"],
+    deps = [
+        ":cuda_driver",
+        ":cuda_gpu_executor_header",
+        ":cuda_stream",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+    ],
+)
+
+cc_library(
+    name = "cuda_stream",
+    srcs = ["cuda_stream.cc"],
+    hdrs = ["cuda_stream.h"],
+    deps = [
+        ":cuda_driver",
+        ":cuda_gpu_executor_header",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor:stream_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "cuda_timer",
+    srcs = ["cuda_timer.cc"],
+    hdrs = ["cuda_timer.h"],
+    deps = [
+        ":cuda_driver",
+        ":cuda_gpu_executor_header",
+        ":cuda_stream",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+    ],
+)
+
+# It implements :cuda_gpu_executor_header
+cc_library(
+    name = "cuda_gpu_executor",
+    srcs = ["cuda_gpu_executor.cc"],
+    hdrs = ["cuda_gpu_executor.h"],
+    deps = [
+        ":cuda_activation",
+        ":cuda_diagnostics",
+        ":cuda_driver",
+        ":cuda_event",
+        ":cuda_kernel",
+        ":cuda_platform_id",
+        ":cuda_stream",
+        ":cuda_timer",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor:timer",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+    ],
+    alwayslink = True,
+)
+
+cc_library(
+    name = "cudnn_version",
+    srcs = ["cudnn_version.cc"],
+    hdrs = ["cudnn_version.h"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "cudnn_version_test",
+    srcs = ["cudnn_version_test.cc"],
+    deps = [
+        ":cudnn_version",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "all_runtime",
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = if_cuda_is_configured([
+        ":cudnn_plugin",
+        ":cufft_plugin",
+        ":cublas_plugin",
+        ":curand_plugin",
+        ":cuda_driver",
+        ":cuda_platform",
+    ]),
+    alwayslink = 1,
+)
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index 957f6c98da564500f81d7185ce6a151003549ee5..7325476ef16799e70b01234ef79e009ca9194c8f 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -58,16 +58,12 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/cuda/cuda_timer.h"
 #include "tensorflow/stream_executor/device_memory.h"
-
-#ifndef PLATFORM_GOOGLE
-#include "tensorflow/stream_executor/dso_loader.h"
-#endif
-
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/status_macros.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 249bad0c109a9191fa0f653637e255bac89fc970..0bd953fe3b549abe5d5ed45ed4d9b3e3b53863ed 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logger.h"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
@@ -38,6 +39,8 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/mathutil.h"
 #include "tensorflow/stream_executor/lib/threadpool.h"
+#include "tensorflow/stream_executor/logging.pb.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/scratch_allocator.h"
@@ -48,6 +51,12 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 // clang-format on
 
+#pragma clang diagnostic push
+
+// Make sure that Eigen::half forward declaration in dnn.h matches the
+// declaration in Eigen.
+#pragma clang diagnostic warning "-Wmismatched-tags"
+
 namespace stream_executor {
 namespace cuda {
 
@@ -155,6 +164,156 @@ class CudnnHandle {
 
 }  // namespace
 
+#ifdef PLATFORM_GOOGLE
+// This macro wraps a global identifier, given by __name, in a callable
+// structure that loads the DLL symbol out of the DSO handle in a thread-safe
+// manner on first use. This dynamic loading technique is used to avoid DSO
+// dependencies on vendor libraries which may or may not be available in the
+// deployed binary environment.
+#define STREAM_EXECUTOR_CUDNN_WRAP(__name)   \
+  struct WrapperShim__##__name {             \
+    template <typename... Args>              \
+    cudnnStatus_t operator()(Args... args) { \
+      return ::__name(args...);              \
+    }                                        \
+  } __name;
+
+#else
+#define STREAM_EXECUTOR_CUDNN_WRAP(__name)                                \
+  struct DynLoadShim__##__name {                                          \
+    static const char* kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void* GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetCudnnDsoHandle();            \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void* f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in cudnn DSO; dlerror: " << s.error_message();   \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    cudnnStatus_t operator()(Args... args) {                              \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char* DynLoadShim__##__name::kName = #__name;
+#endif
+
+// clang-format off
+#define CUDNN_ROUTINE_EACH_V7000_UNDER(__macro)               \
+  __macro(cudnnActivationForward)                             \
+  __macro(cudnnAddTensor)                                     \
+  __macro(cudnnBatchNormalizationBackward)                    \
+  __macro(cudnnBatchNormalizationForwardInference)            \
+  __macro(cudnnBatchNormalizationForwardTraining)             \
+  __macro(cudnnConvolutionBackwardBias)                       \
+  __macro(cudnnConvolutionBackwardData)                       \
+  __macro(cudnnConvolutionBackwardFilter)                     \
+  __macro(cudnnConvolutionBiasActivationForward)              \
+  __macro(cudnnConvolutionForward)                            \
+  __macro(cudnnCreate)                                        \
+  __macro(cudnnCreateActivationDescriptor)                    \
+  __macro(cudnnCreateConvolutionDescriptor)                   \
+  __macro(cudnnCreateDropoutDescriptor)                       \
+  __macro(cudnnCreateFilterDescriptor)                        \
+  __macro(cudnnCreateLRNDescriptor)                           \
+  __macro(cudnnCreatePersistentRNNPlan)                       \
+  __macro(cudnnCreatePoolingDescriptor)                       \
+  __macro(cudnnCreateRNNDescriptor)                           \
+  __macro(cudnnCreateTensorDescriptor)                        \
+  __macro(cudnnDestroy)                                       \
+  __macro(cudnnDestroyActivationDescriptor)                   \
+  __macro(cudnnDestroyConvolutionDescriptor)                  \
+  __macro(cudnnDestroyDropoutDescriptor)                      \
+  __macro(cudnnDestroyFilterDescriptor)                       \
+  __macro(cudnnDestroyLRNDescriptor)                          \
+  __macro(cudnnDestroyPersistentRNNPlan)                      \
+  __macro(cudnnDestroyPoolingDescriptor)                      \
+  __macro(cudnnDestroyRNNDescriptor)                          \
+  __macro(cudnnDestroyTensorDescriptor)                       \
+  __macro(cudnnDropoutGetStatesSize)                          \
+  __macro(cudnnGetActivationDescriptor)                       \
+  __macro(cudnnGetConvolutionBackwardDataAlgorithm)           \
+  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize)       \
+  __macro(cudnnGetConvolutionBackwardFilterAlgorithm)         \
+  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize)     \
+  __macro(cudnnGetConvolutionForwardAlgorithm)                \
+  __macro(cudnnGetConvolutionForwardWorkspaceSize)            \
+  __macro(cudnnGetConvolutionNdDescriptor)                    \
+  __macro(cudnnGetConvolutionNdForwardOutputDim)              \
+  __macro(cudnnGetFilterNdDescriptor)                         \
+  __macro(cudnnGetProperty)                                   \
+  __macro(cudnnGetRNNLinLayerBiasParams)                      \
+  __macro(cudnnGetRNNLinLayerMatrixParams)                    \
+  __macro(cudnnGetRNNParamsSize)                              \
+  __macro(cudnnGetRNNTrainingReserveSize)                     \
+  __macro(cudnnGetRNNWorkspaceSize)                           \
+  __macro(cudnnLRNCrossChannelBackward)                       \
+  __macro(cudnnLRNCrossChannelForward)                        \
+  __macro(cudnnPoolingBackward)                               \
+  __macro(cudnnPoolingForward)                                \
+  __macro(cudnnRNNBackwardData)                               \
+  __macro(cudnnRNNBackwardWeights)                            \
+  __macro(cudnnRNNForwardInference)                           \
+  __macro(cudnnRNNForwardTraining)                            \
+  __macro(cudnnSetActivationDescriptor)                       \
+  __macro(cudnnSetConvolutionNdDescriptor)                    \
+  __macro(cudnnSetDropoutDescriptor)                          \
+  __macro(cudnnSetFilterNdDescriptor)                         \
+  __macro(cudnnSetLRNDescriptor)                              \
+  __macro(cudnnSetPersistentRNNPlan)                          \
+  __macro(cudnnSetPoolingNdDescriptor)                        \
+  __macro(cudnnSetRNNDescriptor)                              \
+  __macro(cudnnSetRNNDescriptor_v6)                           \
+  __macro(cudnnSetStream)                                     \
+  __macro(cudnnSetTensor4dDescriptor)                         \
+  __macro(cudnnSetTensorNdDescriptor)                         \
+  __macro(cudnnTransformTensor)
+
+// clang-format on
+
+CUDNN_ROUTINE_EACH_V7000_UNDER(STREAM_EXECUTOR_CUDNN_WRAP)
+#undef CUDNN_ROUTINE_EACH_V7000_UNDER
+
+#if CUDNN_VERSION >= 7000
+// clang-format off
+#define CUDNN_ROUTINE_EACH_V7000(__macro)                    \
+  __macro(cudnnSetRNNMatrixMathType)                         \
+  __macro(cudnnSetConvolutionMathType)                       \
+  __macro(cudnnSetConvolutionGroupCount)
+
+// clang-format on
+
+CUDNN_ROUTINE_EACH_V7000(STREAM_EXECUTOR_CUDNN_WRAP)
+#undef CUDNN_ROUTINE_EACH_V7000
+#endif
+
+#if CUDNN_VERSION >= 7201
+// clang-format off
+#define CUDNN_ROUTINE_EACH_V7210(__macro)                     \
+  __macro(cudnnCreateRNNDataDescriptor)                       \
+  __macro(cudnnDestroyRNNDataDescriptor)                      \
+  __macro(cudnnRNNBackwardDataEx)                             \
+  __macro(cudnnRNNBackwardWeightsEx)                          \
+  __macro(cudnnRNNForwardInferenceEx)                         \
+  __macro(cudnnRNNForwardTrainingEx)                          \
+  __macro(cudnnSetRNNDataDescriptor)                          \
+  __macro(cudnnSetRNNPaddingMode)
+
+// clang-format on
+
+CUDNN_ROUTINE_EACH_V7210(STREAM_EXECUTOR_CUDNN_WRAP)
+#undef CUDNN_ROUTINE_EACH_V7210
+#endif
+
 // Wraps a cuDNN handle and provides access to it through CudnnHandle
 // instances, which also locks a mutex, acquires the CUDA context, and sets
 // the stream that cuDNN should use to enqueue any work.
@@ -1188,14 +1347,21 @@ port::StatusOr<CudnnRnnParamsDescriptor> CudnnRnnParamsDescriptor::Create(
     for (int region = 0; region < region_count_per_layer; region++) {
       for (int type = 0; type < 2; type++) {
         void* offset = nullptr;
-        RETURN_IF_CUDNN_ERROR((type == 0 ? cudnnGetRNNLinLayerMatrixParams
-                                         : cudnnGetRNNLinLayerBiasParams)(
-            /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc,
-            /*layer=*/layer, /*xDesc=*/input_desc.get(),
-            /*wDesc=*/filter_desc.get(),
-            /*w=*/nullptr, /*linLayerID=*/region,
-            /*linLayerMatDesc=*/region_desc_handle.get(),
-            /*linLayerMat or linLayerBias=*/&offset));
+        RETURN_IF_CUDNN_ERROR(
+            type == 0 ? cudnnGetRNNLinLayerMatrixParams(
+                            /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc,
+                            /*layer=*/layer, /*xDesc=*/input_desc.get(),
+                            /*wDesc=*/filter_desc.get(),
+                            /*w=*/nullptr, /*linLayerID=*/region,
+                            /*linLayerMatDesc=*/region_desc_handle.get(),
+                            /*linLayerMat or linLayerBias=*/&offset)
+                      : cudnnGetRNNLinLayerBiasParams(
+                            /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc,
+                            /*layer=*/layer, /*xDesc=*/input_desc.get(),
+                            /*wDesc=*/filter_desc.get(),
+                            /*w=*/nullptr, /*linLayerID=*/region,
+                            /*linLayerMatDesc=*/region_desc_handle.get(),
+                            /*linLayerMat or linLayerBias=*/&offset));
         int dims[] = {1, 1, 1};
         cudnnDataType_t data_type;
         cudnnTensorFormat_t tensor_format;
@@ -1545,7 +1711,8 @@ port::Status CudnnSupport::DoRnnForwardImpl(
           /*y=*/output_data->opaque(),
           /*hyDesc=*/output_h_desc.handle(), /*hy=*/output_h_data->opaque(),
           /*cyDesc=*/output_c_desc.handle(), /*cy=*/output_c_data->opaque(),
-          NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+          nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
+          nullptr,
           /*workspace=*/workspace.opaque(),
           /*workSpaceSizeInBytes=*/workspace.size()));
 #else
@@ -1581,7 +1748,8 @@ port::Status CudnnSupport::DoRnnForwardImpl(
           /*y=*/output_data->opaque(),
           /*hyDesc=*/output_h_desc.handle(), /*hy=*/output_h_data->opaque(),
           /*cyDesc=*/output_c_desc.handle(), /*cy=*/output_c_data->opaque(),
-          NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+          nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
+          nullptr,
           /*workspace=*/workspace.opaque(),
           /*workSpaceSizeInBytes=*/workspace.size(),
           /*reserveSpace=*/reserve_space.opaque(),
@@ -1679,7 +1847,7 @@ port::Status CudnnSupport::DoRnnBackwardImpl(
         /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
         /*yDesc=*/output_desc.data_handle(), /*y=*/output_data.opaque(),
         /*dyDesc=*/output_desc.data_handle(),
-        /*dy=*/output_backprop_data.opaque(), NULL, NULL,
+        /*dy=*/output_backprop_data.opaque(), nullptr, nullptr,
         /*dhyDesc=*/output_h_desc.handle(),
         /*dhy=*/output_h_backprop_data.opaque(),
         /*dcyDesc=*/output_c_desc.handle(),
@@ -1692,7 +1860,7 @@ port::Status CudnnSupport::DoRnnBackwardImpl(
         /*dhxDesc=*/input_h_desc.handle(),
         /*dhx=*/input_h_backprop_data->opaque(),
         /*dcxDesc=*/input_c_desc.handle(),
-        /*dcx=*/input_c_backprop_data->opaque(), NULL, NULL,
+        /*dcx=*/input_c_backprop_data->opaque(), nullptr, nullptr,
         /*workspace=*/workspace.opaque(),
         /*workSpaceSizeInBytes=*/workspace.size(),
         /*reserveSpace=*/reserve_space_data->opaque(),
@@ -2569,10 +2737,69 @@ bool ShouldIncludeWinogradNonfusedAlgo(
 }
 #endif
 
+template <class ElementType>
+dnn::ConvolutionProto GenerateConvProto(
+    dnn::ConvolutionKind kind, const dnn::BatchDescriptor& input_descriptor,
+    const dnn::FilterDescriptor& filter_descriptor,
+    const dnn::BatchDescriptor& output_descriptor, dnn::AlgorithmDesc algorithm,
+    const dnn::ConvolutionDescriptor& convolution_descriptor, double conv_scale,
+    double side_value_scale, dnn::DataType acc_type,
+    dnn::ActivationMode activation) {
+  dnn::ConvolutionProto conv_config;
+  auto element_type = dnn::ToDataType<ElementType>::value;
+
+  conv_config.set_kind(kind);
+  *conv_config.mutable_input() = input_descriptor.ToProto(element_type);
+  *conv_config.mutable_filter() = filter_descriptor.ToProto(element_type);
+  *conv_config.mutable_output() = output_descriptor.ToProto(element_type);
+  *conv_config.mutable_algorithm() = algorithm.ToProto();
+  *conv_config.mutable_conv_desc() = convolution_descriptor.ToProto();
+  conv_config.mutable_conv_desc()->set_compute_mode(acc_type);
+  conv_config.set_conv_scale(conv_scale);
+  conv_config.set_side_value_scale(side_value_scale);
+  conv_config.set_activation(activation);
+  return conv_config;
+}
+
+void LogCudaProto(const dnn::ConvolutionProto& conv, float profile_time_ms,
+                  StreamExecutor* stream_executor) {
+  {
+    // For rolling-out, temporarily cap the number of logs per process.
+    // TODO(timshen): remove it.
+    static int count_down = 200;
+    if (count_down == 0) {
+      return;
+    }
+    count_down--;
+  }
+
+  ConvLogEntry conv_log;
+  *conv_log.mutable_convolution() = conv;
+  conv_log.set_profile_time_ms(profile_time_ms);
+
+  auto info = conv_log.mutable_cuda_info();
+  int cc_major, cc_minor;
+  stream_executor->GetDeviceDescription().cuda_compute_capability(&cc_major,
+                                                                  &cc_minor);
+  info->mutable_compute_capability()->set_major(cc_major);
+  info->mutable_compute_capability()->set_minor(cc_minor);
+
+  if (auto* dnn = stream_executor->AsDnn()) {
+    port::StatusOr<dnn::VersionInfo> version_or = dnn->GetVersion();
+    if (version_or.ok()) {
+      const auto& version = version_or.ValueOrDie();
+      info->mutable_cudnn_version()->set_major(version.major_version());
+      info->mutable_cudnn_version()->set_minor(version.minor_version());
+      info->mutable_cudnn_version()->set_patch(version.patch());
+    }
+  }
+  tensorflow::Logger::Singleton()->LogProto(conv_log);
+}
+
 }  // namespace
 
 template <class T>
-port::Status CudnnSupport::DoConvolveImpl(
+port::Status CudnnSupport::PrepareForConvolutionImpl(
     Stream* stream, const dnn::BatchDescriptor& input_descriptor,
     const DeviceMemory<T>& input_data,
     const dnn::FilterDescriptor& filter_descriptor,
@@ -2581,6 +2808,34 @@ port::Status CudnnSupport::DoConvolveImpl(
     const dnn::BatchDescriptor& output_descriptor, DeviceMemory<T>* output_data,
     dnn::DataType accumulator_type, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
+    dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
+  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
+  CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type);
+  CudnnTensorDescriptor output_nd(output_descriptor, cudnn_type);
+  CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
+  CudnnConvolutionDescriptor conv(convolution_descriptor,
+                                  ToCudnnDataType(accumulator_type));
+
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+
+  SE_ASSIGN_OR_RETURN(*algorithm_desc,
+                      GetCudnnConvolutionForwardAlgorithm(
+                          stream, cudnn, algorithm_config, input_nd, filter,
+                          conv, output_nd, scratch_allocator, scratch_memory));
+
+  return port::Status::OK();
+}
+
+template <class T>
+port::Status CudnnSupport::DoConvolveImpl(
+    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+    const DeviceMemory<T>& input_data,
+    const dnn::FilterDescriptor& filter_descriptor,
+    const DeviceMemory<T>& filter_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::BatchDescriptor& output_descriptor, DeviceMemory<T>* output_data,
+    dnn::DataType accumulator_type, const dnn::AlgorithmDesc& algorithm_desc,
+    DeviceMemory<uint8>* scratch_memory,
     dnn::ProfileResult* output_profile_result) {
   cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
   CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type);
@@ -2603,12 +2858,6 @@ port::Status CudnnSupport::DoConvolveImpl(
 
   const bool is_profiling = output_profile_result != nullptr;
 
-  DeviceMemory<uint8> scratch;
-  SE_ASSIGN_OR_RETURN(dnn::AlgorithmDesc algo_desc,
-                      GetCudnnConvolutionForwardAlgorithm(
-                          stream, cudnn, algorithm_config, input_nd, filter,
-                          conv, output_nd, scratch_allocator, &scratch));
-
   std::unique_ptr<CUDATimer, TimerDeleter> timer;
   if (is_profiling) {
     timer.reset(new CUDATimer(parent_));  // NOLINT
@@ -2624,7 +2873,7 @@ port::Status CudnnSupport::DoConvolveImpl(
   // memory. See nvbugs/2138754, b/80018418.
   if (CUDNN_VERSION < 7300) {
     SE_RETURN_IF_ERROR([&] {
-      if (algo_desc.algo_id() != CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) {
+      if (algorithm_desc.algo_id() != CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) {
         return port::Status::OK();
       }
       if (input_descriptor.ndims() < 3) {
@@ -2649,7 +2898,8 @@ port::Status CudnnSupport::DoConvolveImpl(
     }());
   }
 
-  if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
+  if (algorithm_desc.algo_id() ==
+          CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
       !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) {
     return port::Status(port::error::FAILED_PRECONDITION,
                         "This configuration has potential integer overflow in "
@@ -2661,18 +2911,26 @@ port::Status CudnnSupport::DoConvolveImpl(
       /*alpha=*/alpha, /*srcDesc=*/input_nd.handle(),
       /*srcData=*/input_data.opaque(), /*filterDesc=*/filter.handle(),
       /*filterData=*/filter_data.opaque(), /*convDesc=*/conv.handle(),
-      /*algo=*/ToConvForwardAlgo(algo_desc), /*workSpace=*/scratch.opaque(),
-      /*workSpaceSizeInBytes=*/scratch.size(), /*beta=*/beta,
+      /*algo=*/ToConvForwardAlgo(algorithm_desc),
+      /*workSpace=*/scratch_memory->opaque(),
+      /*workSpaceSizeInBytes=*/scratch_memory->size(), /*beta=*/beta,
       /*yDesc=*/output_nd.handle(), /*y=*/output_data->opaque()));
 
   if (is_profiling) {
     if (!timer->Stop(AsCUDAStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
-    output_profile_result->set_algorithm(algo_desc);
+    output_profile_result->set_algorithm(algorithm_desc);
     output_profile_result->set_elapsed_time_in_ms(
         timer->GetElapsedMilliseconds());
-    output_profile_result->set_scratch_size(scratch.size());
+    output_profile_result->set_scratch_size(scratch_memory->size());
+
+    LogCudaProto(
+        GenerateConvProto<T>(
+            dnn::ConvolutionKind::FORWARD, input_descriptor, filter_descriptor,
+            output_descriptor, algorithm_desc, convolution_descriptor, dalpha,
+            dbeta, accumulator_type, dnn::ActivationMode::kNone),
+        output_profile_result->elapsed_time_in_ms(), stream->parent());
   }
 
   return port::Status::OK();
@@ -2790,6 +3048,13 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
     output_profile_result->set_elapsed_time_in_ms(
         timer->GetElapsedMilliseconds());
     output_profile_result->set_scratch_size(scratch.size());
+
+    LogCudaProto(GenerateConvProto<ElementType>(
+                     dnn::ConvolutionKind::FORWARD, conv_input_descriptor,
+                     filter_descriptor, output_descriptor, algo_desc,
+                     convolution_descriptor, conv_input_scale, side_input_scale,
+                     accumulator_type, activation_mode),
+                 output_profile_result->elapsed_time_in_ms(), stream->parent());
   }
 
   return port::Status::OK();
@@ -3069,7 +3334,7 @@ port::Status CudnnSupport::DoBatchNormalizationBackwardImpl(
   return port::Status::OK();
 }
 
-bool CudnnSupport::DoConvolve(
+bool CudnnSupport::PrepareForConvolution(
     Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
     const DeviceMemory<float>& input_data,
     const dnn::FilterDescriptor& filter_descriptor,
@@ -3078,12 +3343,70 @@ bool CudnnSupport::DoConvolve(
     const dnn::BatchDescriptor& output_descriptor,
     DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
+    dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
+  return IsStatusOk(PrepareForConvolutionImpl<float>(
+                        stream, batch_descriptor, input_data, filter_descriptor,
+                        filter_data, convolution_descriptor, output_descriptor,
+                        output_data, dnn::DataType::kFloat, scratch_allocator,
+                        algorithm_config, algorithm_desc, scratch_memory),
+                    /*report_error=*/true);
+}
+
+bool CudnnSupport::PrepareForConvolution(
+    Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
+    const DeviceMemory<double>& input_data,
+    const dnn::FilterDescriptor& filter_descriptor,
+    const DeviceMemory<double>& filter_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
+    const dnn::AlgorithmConfig& algorithm_config,
+    dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
+  return IsStatusOk(PrepareForConvolutionImpl<double>(
+                        stream, batch_descriptor, input_data, filter_descriptor,
+                        filter_data, convolution_descriptor, output_descriptor,
+                        output_data, dnn::DataType::kDouble, scratch_allocator,
+                        algorithm_config, algorithm_desc, scratch_memory),
+                    /*report_error=*/true);
+}
+
+bool CudnnSupport::PrepareForConvolution(
+    Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
+    const DeviceMemory<Eigen::half>& input_data,
+    const dnn::FilterDescriptor& filter_descriptor,
+    const DeviceMemory<Eigen::half>& filter_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemory<Eigen::half>* output_data, ScratchAllocator* scratch_allocator,
+    const dnn::AlgorithmConfig& algorithm_config,
+    dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
+  dnn::DataType acc_type =
+      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
+          ? dnn::DataType::kFloat
+          : dnn::DataType::kHalf;
+  return IsStatusOk(
+      PrepareForConvolutionImpl<Eigen::half>(
+          stream, batch_descriptor, input_data, filter_descriptor, filter_data,
+          convolution_descriptor, output_descriptor, output_data, acc_type,
+          scratch_allocator, algorithm_config, algorithm_desc, scratch_memory),
+      /*report_error=*/true);
+}
+
+bool CudnnSupport::DoConvolve(
+    Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
+    const DeviceMemory<float>& input_data,
+    const dnn::FilterDescriptor& filter_descriptor,
+    const DeviceMemory<float>& filter_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemory<float>* output_data, const dnn::AlgorithmDesc& algorithm_desc,
+    DeviceMemory<uint8>* scratch_memory,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
       DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
                      filter_data, convolution_descriptor, output_descriptor,
-                     output_data, dnn::DataType::kFloat, scratch_allocator,
-                     algorithm_config, output_profile_result),
+                     output_data, dnn::DataType::kFloat, algorithm_desc,
+                     scratch_memory, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3094,14 +3417,14 @@ bool CudnnSupport::DoConvolve(
     const DeviceMemory<double>& filter_data,
     const dnn::ConvolutionDescriptor& convolution_descriptor,
     const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
+    DeviceMemory<double>* output_data, const dnn::AlgorithmDesc& algorithm_desc,
+    DeviceMemory<uint8>* scratch_memory,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
       DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
                      filter_data, convolution_descriptor, output_descriptor,
-                     output_data, dnn::DataType::kDouble, scratch_allocator,
-                     algorithm_config, output_profile_result),
+                     output_data, dnn::DataType::kDouble, algorithm_desc,
+                     scratch_memory, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3112,8 +3435,9 @@ bool CudnnSupport::DoConvolve(
     const DeviceMemory<Eigen::half>& filter_data,
     const dnn::ConvolutionDescriptor& convolution_descriptor,
     const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<Eigen::half>* output_data, ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
+    DeviceMemory<Eigen::half>* output_data,
+    const dnn::AlgorithmDesc& algorithm_desc,
+    DeviceMemory<uint8>* scratch_memory,
     dnn::ProfileResult* output_profile_result) {
   dnn::DataType acc_type =
       CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
@@ -3122,7 +3446,7 @@ bool CudnnSupport::DoConvolve(
   return IsStatusOk(
       DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
                      filter_data, convolution_descriptor, output_descriptor,
-                     output_data, acc_type, scratch_allocator, algorithm_config,
+                     output_data, acc_type, algorithm_desc, scratch_memory,
                      output_profile_result),
       /*report_error=*/!output_profile_result);
 }
@@ -3258,7 +3582,7 @@ bool CudnnSupport::DoTransformTensor(Stream* stream,
 }
 
 template <class T>
-port::Status CudnnSupport::DoConvolveBackwardDataImpl(
+port::Status CudnnSupport::PrepareForConvolutionBackwardDataImpl(
     Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
     const DeviceMemory<T>& filter_data,
     const dnn::BatchDescriptor& output_descriptor,
@@ -3268,6 +3592,36 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl(
     DeviceMemory<T>* backward_input_data, dnn::DataType accumulator_type,
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
+    dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
+  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+
+  CudnnTensorDescriptor out_back_nd(output_descriptor, cudnn_type);
+  CudnnTensorDescriptor in_back_nd(input_descriptor, cudnn_type);
+  CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
+  CudnnConvolutionDescriptor conv(convolution_descriptor,
+                                  ToCudnnDataType(accumulator_type));
+
+  SE_ASSIGN_OR_RETURN(
+      *algorithm_desc,
+      GetCudnnConvolutionBackwardDataAlgorithm(
+          stream, cudnn, algorithm_config, in_back_nd, filter, conv,
+          out_back_nd, scratch_allocator, scratch_memory));
+
+  return port::Status::OK();
+}
+
+template <class T>
+port::Status CudnnSupport::DoConvolveBackwardDataImpl(
+    Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
+    const DeviceMemory<T>& filter_data,
+    const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemory<T> backward_output_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::BatchDescriptor& input_descriptor,
+    DeviceMemory<T>* backward_input_data, dnn::DataType accumulator_type,
+    const dnn::AlgorithmDesc& algorithm_desc,
+    DeviceMemory<uint8>* scratch_memory,
     dnn::ProfileResult* output_profile_result) {
   cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
   // Alpha is the scaling factor for input.
@@ -3291,12 +3645,6 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl(
 
   const bool is_profiling = output_profile_result != nullptr;
 
-  DeviceMemory<uint8> scratch;
-  SE_ASSIGN_OR_RETURN(dnn::AlgorithmDesc algo_desc,
-                      GetCudnnConvolutionBackwardDataAlgorithm(
-                          stream, cudnn, algorithm_config, in_back_nd, filter,
-                          conv, out_back_nd, scratch_allocator, &scratch));
-
   std::unique_ptr<CUDATimer, TimerDeleter> timer;
   if (is_profiling) {
     timer.reset(new CUDATimer(parent_));  // NOLINT
@@ -3308,7 +3656,8 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl(
     }
   }
 
-  if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
+  if (algorithm_desc.algo_id() ==
+          CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
       !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) {
     return port::Status(port::error::FAILED_PRECONDITION,
                         "This configuration has potential integer overflow in "
@@ -3318,44 +3667,51 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl(
   // Cudnn 7.1.4 has a bug if the workspace of the following convolution is not
   // zero-initialized, nvbugs/2254619.
   if (CUDNN_VERSION >= 7000 && CUDNN_VERSION < 7300 &&
-      algo_desc.algo_id() == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 &&
-      cudnn_type == CUDNN_DATA_HALF && algo_desc.tensor_ops_enabled() &&
+      algorithm_desc.algo_id() == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 &&
+      cudnn_type == CUDNN_DATA_HALF && algorithm_desc.tensor_ops_enabled() &&
       input_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
       filter_descriptor.layout() == dnn::FilterLayout::kOutputInputYX &&
       output_descriptor.layout() == dnn::DataLayout::kBatchDepthYX &&
       (convolution_descriptor.vertical_filter_stride() > 1 ||
        convolution_descriptor.horizontal_filter_stride() > 1)) {
-    stream->ThenMemZero(&scratch, scratch.size());
+    stream->ThenMemZero(scratch_memory, scratch_memory->size());
   }
 
-  RETURN_IF_CUDNN_ERROR(
-      cudnnConvolutionBackwardData(cudnn.handle(),
-                                   /*alpha=*/alpha,
-                                   /*wDesc=*/filter.handle(),
-                                   /*w=*/filter_data.opaque(),
-                                   /*dyDesc=*/out_back_nd.handle(),
-                                   /*dy=*/backward_output_data.opaque(),
-                                   /*convDesc=*/conv.handle(),
-                                   /*algo=*/ToConvBackwardDataAlgo(algo_desc),
-                                   /*workSpace=*/scratch.opaque(),
-                                   /*workSpaceSizeInBytes=*/scratch.size(),
-                                   /*beta=*/beta,
-                                   /*dxDesc=*/in_back_nd.handle(),
-                                   /*dx=*/backward_input_data->opaque()));
+  RETURN_IF_CUDNN_ERROR(cudnnConvolutionBackwardData(
+      cudnn.handle(),
+      /*alpha=*/alpha,
+      /*wDesc=*/filter.handle(),
+      /*w=*/filter_data.opaque(),
+      /*dyDesc=*/out_back_nd.handle(),
+      /*dy=*/backward_output_data.opaque(),
+      /*convDesc=*/conv.handle(),
+      /*algo=*/ToConvBackwardDataAlgo(algorithm_desc),
+      /*workSpace=*/scratch_memory->opaque(),
+      /*workSpaceSizeInBytes=*/scratch_memory->size(),
+      /*beta=*/beta,
+      /*dxDesc=*/in_back_nd.handle(),
+      /*dx=*/backward_input_data->opaque()));
   if (is_profiling) {
     if (!timer->Stop(AsCUDAStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
-    output_profile_result->set_algorithm(algo_desc);
+    output_profile_result->set_algorithm(algorithm_desc);
     output_profile_result->set_elapsed_time_in_ms(
         timer->GetElapsedMilliseconds());
-    output_profile_result->set_scratch_size(scratch.size());
+    output_profile_result->set_scratch_size(scratch_memory->size());
+
+    LogCudaProto(GenerateConvProto<T>(
+                     dnn::ConvolutionKind::BACKWARD_DATA, input_descriptor,
+                     filter_descriptor, output_descriptor, algorithm_desc,
+                     convolution_descriptor, dalpha, dbeta, accumulator_type,
+                     dnn::ActivationMode::kNone),
+                 output_profile_result->elapsed_time_in_ms(), stream->parent());
   }
 
   return port::Status::OK();
 }
 
-bool CudnnSupport::DoConvolveBackwardData(
+bool CudnnSupport::PrepareForConvolutionBackwardData(
     Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
     const DeviceMemory<double>& filter_data,
     const dnn::BatchDescriptor& output_descriptor,
@@ -3365,17 +3721,17 @@ bool CudnnSupport::DoConvolveBackwardData(
     DeviceMemory<double>* backward_input_data,
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
+    dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
   return IsStatusOk(
-      DoConvolveBackwardDataImpl(
+      PrepareForConvolutionBackwardDataImpl(
           stream, filter_descriptor, filter_data, output_descriptor,
           backward_output_data, convolution_descriptor, input_descriptor,
           backward_input_data, dnn::DataType::kDouble, scratch_allocator,
-          algorithm_config, output_profile_result),
-      /*report_error=*/!output_profile_result);
+          algorithm_config, algorithm_desc, scratch_memory),
+      /*report_error=*/true);
 }
 
-bool CudnnSupport::DoConvolveBackwardData(
+bool CudnnSupport::PrepareForConvolutionBackwardData(
     Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
     const DeviceMemory<float>& filter_data,
     const dnn::BatchDescriptor& output_descriptor,
@@ -3385,17 +3741,17 @@ bool CudnnSupport::DoConvolveBackwardData(
     DeviceMemory<float>* backward_input_data,
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
+    dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
   return IsStatusOk(
-      DoConvolveBackwardDataImpl(
+      PrepareForConvolutionBackwardDataImpl(
           stream, filter_descriptor, filter_data, output_descriptor,
           backward_output_data, convolution_descriptor, input_descriptor,
           backward_input_data, dnn::DataType::kFloat, scratch_allocator,
-          algorithm_config, output_profile_result),
-      /*report_error=*/!output_profile_result);
+          algorithm_config, algorithm_desc, scratch_memory),
+      /*report_error=*/true);
 }
 
-bool CudnnSupport::DoConvolveBackwardData(
+bool CudnnSupport::PrepareForConvolutionBackwardData(
     Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
     const DeviceMemory<Eigen::half>& filter_data,
     const dnn::BatchDescriptor& output_descriptor,
@@ -3405,22 +3761,86 @@ bool CudnnSupport::DoConvolveBackwardData(
     DeviceMemory<Eigen::half>* backward_input_data,
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
+    dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
   dnn::DataType acc_type =
       CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
           ? dnn::DataType::kFloat
           : dnn::DataType::kHalf;
   return IsStatusOk(
-      DoConvolveBackwardDataImpl(
+      PrepareForConvolutionBackwardDataImpl(
           stream, filter_descriptor, filter_data, output_descriptor,
           backward_output_data, convolution_descriptor, input_descriptor,
           backward_input_data, acc_type, scratch_allocator, algorithm_config,
-          output_profile_result),
+          algorithm_desc, scratch_memory),
+      /*report_error=*/true);
+}
+
+bool CudnnSupport::DoConvolveBackwardData(
+    Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
+    const DeviceMemory<double>& filter_data,
+    const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemory<double> backward_output_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::BatchDescriptor& input_descriptor,
+    DeviceMemory<double>* backward_input_data,
+    const dnn::AlgorithmDesc& algorithm_desc,
+    DeviceMemory<uint8>* scratch_memory,
+    dnn::ProfileResult* output_profile_result) {
+  return IsStatusOk(
+      DoConvolveBackwardDataImpl(
+          stream, filter_descriptor, filter_data, output_descriptor,
+          backward_output_data, convolution_descriptor, input_descriptor,
+          backward_input_data, dnn::DataType::kDouble, algorithm_desc,
+          scratch_memory, output_profile_result),
+      /*report_error=*/!output_profile_result);
+}
+
+bool CudnnSupport::DoConvolveBackwardData(
+    Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
+    const DeviceMemory<float>& filter_data,
+    const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemory<float> backward_output_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::BatchDescriptor& input_descriptor,
+    DeviceMemory<float>* backward_input_data,
+    const dnn::AlgorithmDesc& algorithm_desc,
+    DeviceMemory<uint8>* scratch_memory,
+    dnn::ProfileResult* output_profile_result) {
+  return IsStatusOk(
+      DoConvolveBackwardDataImpl(
+          stream, filter_descriptor, filter_data, output_descriptor,
+          backward_output_data, convolution_descriptor, input_descriptor,
+          backward_input_data, dnn::DataType::kFloat, algorithm_desc,
+          scratch_memory, output_profile_result),
+      /*report_error=*/!output_profile_result);
+}
+
+bool CudnnSupport::DoConvolveBackwardData(
+    Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
+    const DeviceMemory<Eigen::half>& filter_data,
+    const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemory<Eigen::half> backward_output_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::BatchDescriptor& input_descriptor,
+    DeviceMemory<Eigen::half>* backward_input_data,
+    const dnn::AlgorithmDesc& algorithm_desc,
+    DeviceMemory<uint8>* scratch_memory,
+    dnn::ProfileResult* output_profile_result) {
+  dnn::DataType acc_type =
+      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
+          ? dnn::DataType::kFloat
+          : dnn::DataType::kHalf;
+  return IsStatusOk(
+      DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
+                                 output_descriptor, backward_output_data,
+                                 convolution_descriptor, input_descriptor,
+                                 backward_input_data, acc_type, algorithm_desc,
+                                 scratch_memory, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
 template <class T>
-port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
+port::Status CudnnSupport::PrepareForConvolutionBackwardFilterImpl(
     Stream* stream, const dnn::BatchDescriptor& input_descriptor,
     const DeviceMemory<T>& input_data,
     const dnn::BatchDescriptor& output_descriptor,
@@ -3430,6 +3850,36 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
     DeviceMemory<T>* backward_filter_data, dnn::DataType accumulator_type,
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
+    dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
+  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+
+  CudnnTensorDescriptor out_back_nd(output_descriptor, cudnn_type);
+  CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type);
+  CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
+  CudnnConvolutionDescriptor conv(convolution_descriptor,
+                                  ToCudnnDataType(accumulator_type));
+
+  SE_ASSIGN_OR_RETURN(
+      *algorithm_desc,
+      GetCudnnConvolutionBackwardFilterAlgorithm(
+          stream, cudnn, algorithm_config, input_nd, filter, conv, out_back_nd,
+          scratch_allocator, scratch_memory));
+
+  return port::Status::OK();
+}
+
+template <class T>
+port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
+    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+    const DeviceMemory<T>& input_data,
+    const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemory<T> backward_output_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::FilterDescriptor& filter_descriptor,
+    DeviceMemory<T>* backward_filter_data, dnn::DataType accumulator_type,
+    const dnn::AlgorithmDesc& algorithm_desc,
+    DeviceMemory<uint8>* scratch_memory,
     dnn::ProfileResult* output_profile_result) {
   cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
   // Alpha is the scaling factor for input.
@@ -3453,12 +3903,6 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
 
   const bool is_profiling = output_profile_result != nullptr;
 
-  DeviceMemory<uint8> scratch;
-  SE_ASSIGN_OR_RETURN(dnn::AlgorithmDesc algo_desc,
-                      GetCudnnConvolutionBackwardFilterAlgorithm(
-                          stream, cudnn, algorithm_config, input_nd, filter,
-                          conv, out_back_nd, scratch_allocator, &scratch));
-
   std::unique_ptr<CUDATimer, TimerDeleter> timer;
   if (is_profiling) {
     timer.reset(new CUDATimer(parent_));  // NOLINT
@@ -3474,7 +3918,8 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
   // results. See nvbugs/2072856
   if (CUDNN_VERSION < 7300) {
     SE_RETURN_IF_ERROR([&] {
-      if (algo_desc.algo_id() != CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING) {
+      if (algorithm_desc.algo_id() !=
+          CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING) {
         return port::Status::OK();
       }
       if (output_descriptor.height() > 1 && output_descriptor.width() > 1) {
@@ -3500,7 +3945,8 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
     }());
   }
 
-  if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
+  if (algorithm_desc.algo_id() ==
+          CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
       !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) {
     return port::Status(port::error::FAILED_PRECONDITION,
                         "This configuration has potential integer overflow in "
@@ -3516,7 +3962,7 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
   //
   // See nvbugs/2379553.
   if (CUDNN_VERSION >= 7100 && CUDNN_VERSION < 7300 &&
-      algo_desc.algo_id() == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 &&
+      algorithm_desc.algo_id() == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 &&
       cudnn_type == CUDNN_DATA_HALF &&
       input_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
       filter_descriptor.layout() == dnn::FilterLayout::kOutputYXInput &&
@@ -3534,9 +3980,9 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
       /*diffDesc=*/out_back_nd.handle(),
       /*diffData=*/backward_output_data.opaque(),
       /*convDesc=*/conv.handle(),
-      /*algo=*/ToConvBackwardFilterAlgo(algo_desc),
-      /*workSpace=*/scratch.opaque(),
-      /*workSpaceSizeInBytes=*/scratch.size(),
+      /*algo=*/ToConvBackwardFilterAlgo(algorithm_desc),
+      /*workSpace=*/scratch_memory->opaque(),
+      /*workSpaceSizeInBytes=*/scratch_memory->size(),
       /*beta=*/beta,
       /*gradDesc=*/filter.handle(),
       /*dw=*/backward_filter_data->opaque()));
@@ -3544,16 +3990,23 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
     if (!timer->Stop(AsCUDAStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
-    output_profile_result->set_algorithm(algo_desc);
+    output_profile_result->set_algorithm(algorithm_desc);
     output_profile_result->set_elapsed_time_in_ms(
         timer->GetElapsedMilliseconds());
-    output_profile_result->set_scratch_size(scratch.size());
+    output_profile_result->set_scratch_size(scratch_memory->size());
+
+    LogCudaProto(GenerateConvProto<T>(
+                     dnn::ConvolutionKind::BACKWARD_FILTER, input_descriptor,
+                     filter_descriptor, output_descriptor, algorithm_desc,
+                     convolution_descriptor, dalpha, dbeta, accumulator_type,
+                     dnn::ActivationMode::kNone),
+                 output_profile_result->elapsed_time_in_ms(), stream->parent());
   }
 
   return port::Status::OK();
 }
 
-bool CudnnSupport::DoConvolveBackwardFilter(
+bool CudnnSupport::PrepareForConvolutionBackwardFilter(
     Stream* stream, const dnn::BatchDescriptor& input_descriptor,
     const DeviceMemory<double>& input_data,
     const dnn::BatchDescriptor& output_descriptor,
@@ -3563,18 +4016,17 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     DeviceMemory<double>* backward_filter_data,
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
+    dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
   return IsStatusOk(
-      DoConvolveBackwardFilterImpl(
+      PrepareForConvolutionBackwardFilterImpl(
           stream, input_descriptor, input_data, output_descriptor,
           backward_output_data, convolution_descriptor, filter_descriptor,
-          backward_filter_data, dnn::DataType::kDouble,
-
-          scratch_allocator, algorithm_config, output_profile_result),
-      /*report_error=*/!output_profile_result);
+          backward_filter_data, dnn::DataType::kDouble, scratch_allocator,
+          algorithm_config, algorithm_desc, scratch_memory),
+      /*report_error=*/true);
 }
 
-bool CudnnSupport::DoConvolveBackwardFilter(
+bool CudnnSupport::PrepareForConvolutionBackwardFilter(
     Stream* stream, const dnn::BatchDescriptor& input_descriptor,
     const DeviceMemory<float>& input_data,
     const dnn::BatchDescriptor& output_descriptor,
@@ -3584,18 +4036,17 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     DeviceMemory<float>* backward_filter_data,
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(DoConvolveBackwardFilterImpl(
-                        stream, input_descriptor, input_data, output_descriptor,
-                        backward_output_data, convolution_descriptor,
-                        filter_descriptor, backward_filter_data,
-
-                        dnn::DataType::kFloat, scratch_allocator,
-                        algorithm_config, output_profile_result),
-                    /*report_error=*/!output_profile_result);
+    dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
+  return IsStatusOk(
+      PrepareForConvolutionBackwardFilterImpl(
+          stream, input_descriptor, input_data, output_descriptor,
+          backward_output_data, convolution_descriptor, filter_descriptor,
+          backward_filter_data, dnn::DataType::kFloat, scratch_allocator,
+          algorithm_config, algorithm_desc, scratch_memory),
+      /*report_error=*/true);
 }
 
-bool CudnnSupport::DoConvolveBackwardFilter(
+bool CudnnSupport::PrepareForConvolutionBackwardFilter(
     Stream* stream, const dnn::BatchDescriptor& input_descriptor,
     const DeviceMemory<Eigen::half>& input_data,
     const dnn::BatchDescriptor& output_descriptor,
@@ -3605,20 +4056,83 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     DeviceMemory<Eigen::half>* backward_filter_data,
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
+    dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
   dnn::DataType acc_type =
       CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
           ? dnn::DataType::kFloat
           : dnn::DataType::kHalf;
   return IsStatusOk(
-      DoConvolveBackwardFilterImpl(
+      PrepareForConvolutionBackwardFilterImpl(
           stream, input_descriptor, input_data, output_descriptor,
           backward_output_data, convolution_descriptor, filter_descriptor,
           backward_filter_data, acc_type, scratch_allocator, algorithm_config,
-          output_profile_result),
+          algorithm_desc, scratch_memory),
+      /*report_error=*/true);
+}
+
+bool CudnnSupport::DoConvolveBackwardFilter(
+    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+    const DeviceMemory<double>& input_data,
+    const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemory<double> backward_output_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::FilterDescriptor& filter_descriptor,
+    DeviceMemory<double>* backward_filter_data,
+    const dnn::AlgorithmDesc& algorithm_desc,
+    DeviceMemory<uint8>* scratch_memory,
+    dnn::ProfileResult* output_profile_result) {
+  return IsStatusOk(
+      DoConvolveBackwardFilterImpl(
+          stream, input_descriptor, input_data, output_descriptor,
+          backward_output_data, convolution_descriptor, filter_descriptor,
+          backward_filter_data, dnn::DataType::kDouble, algorithm_desc,
+          scratch_memory, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
+bool CudnnSupport::DoConvolveBackwardFilter(
+    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+    const DeviceMemory<float>& input_data,
+    const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemory<float> backward_output_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::FilterDescriptor& filter_descriptor,
+    DeviceMemory<float>* backward_filter_data,
+    const dnn::AlgorithmDesc& algorithm_desc,
+    DeviceMemory<uint8>* scratch_memory,
+    dnn::ProfileResult* output_profile_result) {
+  return IsStatusOk(
+      DoConvolveBackwardFilterImpl(
+          stream, input_descriptor, input_data, output_descriptor,
+          backward_output_data, convolution_descriptor, filter_descriptor,
+          backward_filter_data, dnn::DataType::kFloat, algorithm_desc,
+          scratch_memory, output_profile_result),
+      /*report_error=*/!output_profile_result);
+}
+
+bool CudnnSupport::DoConvolveBackwardFilter(
+    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+    const DeviceMemory<Eigen::half>& input_data,
+    const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemory<Eigen::half> backward_output_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::FilterDescriptor& filter_descriptor,
+    DeviceMemory<Eigen::half>* backward_filter_data,
+    const dnn::AlgorithmDesc& algorithm_desc,
+    DeviceMemory<uint8>* scratch_memory,
+    dnn::ProfileResult* output_profile_result) {
+  dnn::DataType acc_type =
+      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
+          ? dnn::DataType::kFloat
+          : dnn::DataType::kHalf;
+  return IsStatusOk(DoConvolveBackwardFilterImpl(
+                        stream, input_descriptor, input_data, output_descriptor,
+                        backward_output_data, convolution_descriptor,
+                        filter_descriptor, backward_filter_data, acc_type,
+                        algorithm_desc, scratch_memory, output_profile_result),
+                    /*report_error=*/!output_profile_result);
+}
+
 template <class T>
 port::Status CudnnSupport::DoConvolveBackwardBiasImpl(
     Stream* stream, const dnn::BatchDescriptor& input_descriptor,
@@ -3952,6 +4466,31 @@ bool CudnnSupport::DoPoolForward(
   return IsStatusOk(status, /*report_error=*/true);
 }
 
+bool CudnnSupport::DoPoolForward(
+    Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
+    const dnn::BatchDescriptor& input_dimensions,
+    const DeviceMemory<int8>& input_data,
+    const dnn::BatchDescriptor& output_dimensions,
+    DeviceMemory<int8>* output_data, ScratchAllocator* workspace_allocator) {
+  // Alpha is the scaling factor for input.
+  float alpha = 1.0;
+  // Beta is the scaling factor for output.
+  float beta = 0.0;
+
+  CudnnTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_INT8);
+  CudnnTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_INT8);
+  CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
+
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+  auto status = [&] {
+    RETURN_IF_CUDNN_ERROR(cudnnPoolingForward(
+        cudnn.handle(), pooling_desc.handle(), &alpha, src_desc.handle(),
+        input_data.opaque(), &beta, dest_desc.handle(), output_data->opaque()));
+    return port::Status::OK();
+  }();
+  return IsStatusOk(status, /*report_error=*/true);
+}
+
 bool CudnnSupport::DoPoolBackward(
     Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
     const dnn::BatchDescriptor& input_dimensions,
@@ -4282,5 +4821,7 @@ void initialize_cudnn() {
 
 }  // namespace stream_executor
 
+#pragma clang diagnostic pop
+
 REGISTER_MODULE_INITIALIZER(register_cudnn,
                             { stream_executor::initialize_cudnn(); });
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 044ed545145bdc521c32225a0e95f9dd63eace69..d7514981d569a80feea6720dd4463f0b29f01633 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -258,6 +258,43 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<float>* scale_backprop,
       DeviceMemory<float>* offset_backprop) override;
 
+  bool PrepareForConvolution(
+      Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
+      const DeviceMemory<float>& input_data,
+      const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<float>& filter_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      dnn::AlgorithmDesc* algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory) override;
+
+  bool PrepareForConvolution(
+      Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
+      const DeviceMemory<double>& input_data,
+      const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<double>& filter_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      dnn::AlgorithmDesc* algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory) override;
+
+  bool PrepareForConvolution(
+      Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
+      const DeviceMemory<Eigen::half>& input_data,
+      const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<Eigen::half>& filter_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<Eigen::half>* output_data,
+      ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      dnn::AlgorithmDesc* algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory) override;
+
   bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
                   const DeviceMemory<float>& input_data,
                   const dnn::FilterDescriptor& filter_descriptor,
@@ -265,8 +302,8 @@ class CudnnSupport : public dnn::DnnSupport {
                   const dnn::ConvolutionDescriptor& convolution_descriptor,
                   const dnn::BatchDescriptor& output_descriptor,
                   DeviceMemory<float>* output_data,
-                  ScratchAllocator* scratch_allocator,
-                  const dnn::AlgorithmConfig& algorithm_config,
+                  const dnn::AlgorithmDesc& algorithm_desc,
+                  DeviceMemory<uint8>* scratch_memory,
                   dnn::ProfileResult* output_profile_result) override;
 
   bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
@@ -276,8 +313,8 @@ class CudnnSupport : public dnn::DnnSupport {
                   const dnn::ConvolutionDescriptor& convolution_descriptor,
                   const dnn::BatchDescriptor& output_descriptor,
                   DeviceMemory<double>* output_data,
-                  ScratchAllocator* scratch_allocator,
-                  const dnn::AlgorithmConfig& algorithm_config,
+                  const dnn::AlgorithmDesc& algorithm_desc,
+                  DeviceMemory<uint8>* scratch_memory,
                   dnn::ProfileResult* output_profile_result) override;
 
   bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
@@ -287,8 +324,8 @@ class CudnnSupport : public dnn::DnnSupport {
                   const dnn::ConvolutionDescriptor& convolution_descriptor,
                   const dnn::BatchDescriptor& output_descriptor,
                   DeviceMemory<Eigen::half>* output_data,
-                  ScratchAllocator* scratch_allocator,
-                  const dnn::AlgorithmConfig& algorithm_config,
+                  const dnn::AlgorithmDesc& algorithm_desc,
+                  DeviceMemory<uint8>* scratch_memory,
                   dnn::ProfileResult* output_profile_result) override;
 
   bool DoFusedConvolve(
@@ -390,7 +427,20 @@ class CudnnSupport : public dnn::DnnSupport {
     return false;
   }
 
-  bool DoConvolveBackwardData(
+  bool PrepareForConvolutionBackwardData(
+      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<float>& filter_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<float> backward_output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& input_descriptor,
+      DeviceMemory<float>* backward_input_data,
+      ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      dnn::AlgorithmDesc* algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory) override;
+
+  bool PrepareForConvolutionBackwardData(
       Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
       const DeviceMemory<double>& filter_data,
       const dnn::BatchDescriptor& output_descriptor,
@@ -400,6 +450,32 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<double>* backward_input_data,
       ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
+      dnn::AlgorithmDesc* algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory) override;
+
+  bool PrepareForConvolutionBackwardData(
+      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<Eigen::half>& filter_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<Eigen::half> backward_output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& input_descriptor,
+      DeviceMemory<Eigen::half>* backward_input_data,
+      ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      dnn::AlgorithmDesc* algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory) override;
+
+  bool DoConvolveBackwardData(
+      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<double>& filter_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<double> backward_output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& input_descriptor,
+      DeviceMemory<double>* backward_input_data,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
       dnn::ProfileResult* output_profile_result) override;
 
   bool DoConvolveBackwardData(
@@ -410,8 +486,8 @@ class CudnnSupport : public dnn::DnnSupport {
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       const dnn::BatchDescriptor& input_descriptor,
       DeviceMemory<float>* backward_input_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
       dnn::ProfileResult* output_profile_result) override;
 
   bool DoConvolveBackwardData(
@@ -422,11 +498,11 @@ class CudnnSupport : public dnn::DnnSupport {
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       const dnn::BatchDescriptor& input_descriptor,
       DeviceMemory<Eigen::half>* backward_input_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
       dnn::ProfileResult* output_profile_result) override;
 
-  bool DoConvolveBackwardFilter(
+  bool PrepareForConvolutionBackwardFilter(
       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
       const DeviceMemory<double>& input_data,
       const dnn::BatchDescriptor& output_descriptor,
@@ -436,9 +512,10 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<double>* backward_filter_data,
       ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) override;
+      dnn::AlgorithmDesc* algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory) override;
 
-  bool DoConvolveBackwardFilter(
+  bool PrepareForConvolutionBackwardFilter(
       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
       const DeviceMemory<float>& input_data,
       const dnn::BatchDescriptor& output_descriptor,
@@ -448,9 +525,10 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<float>* backward_filter_data,
       ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) override;
+      dnn::AlgorithmDesc* algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory) override;
 
-  bool DoConvolveBackwardFilter(
+  bool PrepareForConvolutionBackwardFilter(
       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
       const DeviceMemory<Eigen::half>& input_data,
       const dnn::BatchDescriptor& output_descriptor,
@@ -460,6 +538,43 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<Eigen::half>* backward_filter_data,
       ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
+      dnn::AlgorithmDesc* algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory) override;
+
+  bool DoConvolveBackwardFilter(
+      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+      const DeviceMemory<double>& input_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<double> backward_output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::FilterDescriptor& filter_descriptor,
+      DeviceMemory<double>* backward_filter_data,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
+      dnn::ProfileResult* output_profile_result) override;
+
+  bool DoConvolveBackwardFilter(
+      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+      const DeviceMemory<float>& input_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<float> backward_output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::FilterDescriptor& filter_descriptor,
+      DeviceMemory<float>* backward_filter_data,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
+      dnn::ProfileResult* output_profile_result) override;
+
+  bool DoConvolveBackwardFilter(
+      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+      const DeviceMemory<Eigen::half>& input_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<Eigen::half> backward_output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::FilterDescriptor& filter_descriptor,
+      DeviceMemory<Eigen::half>* backward_filter_data,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
       dnn::ProfileResult* output_profile_result) override;
 
   bool DoConvolveBackwardBias(
@@ -540,6 +655,14 @@ class CudnnSupport : public dnn::DnnSupport {
                      DeviceMemory<Eigen::half>* output_data,
                      ScratchAllocator* workspace_allocator) override;
 
+  bool DoPoolForward(Stream* stream,
+                     const dnn::PoolingDescriptor& pooling_dimensions,
+                     const dnn::BatchDescriptor& input_dimensions,
+                     const DeviceMemory<int8>& input_data,
+                     const dnn::BatchDescriptor& output_dimensions,
+                     DeviceMemory<int8>* output_data,
+                     ScratchAllocator* workspace_allocator) override;
+
   bool DoPoolBackward(Stream* stream,
                       const dnn::PoolingDescriptor& pooling_dimensions,
                       const dnn::BatchDescriptor& input_dimensions,
@@ -669,7 +792,7 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<U>* offset_backprop);
 
   template <class T>
-  port::Status DoConvolveImpl(
+  port::Status PrepareForConvolutionImpl(
       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
       const DeviceMemory<T>& input_data,
       const dnn::FilterDescriptor& filter_descriptor,
@@ -679,6 +802,19 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<T>* output_data, dnn::DataType accumulator_type,
       ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
+      dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory);
+
+  template <class T>
+  port::Status DoConvolveImpl(
+      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+      const DeviceMemory<T>& input_data,
+      const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<T>& filter_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<T>* output_data, dnn::DataType accumulator_type,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
       dnn::ProfileResult* output_profile_result);
 
   template <typename ElementType, typename BiasType, typename ScaleType>
@@ -699,7 +835,7 @@ class CudnnSupport : public dnn::DnnSupport {
       dnn::ProfileResult* output_profile_result);
 
   template <class T>
-  port::Status DoConvolveBackwardDataImpl(
+  port::Status PrepareForConvolutionBackwardDataImpl(
       Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
       const DeviceMemory<T>& filter_data,
       const dnn::BatchDescriptor& output_descriptor,
@@ -709,19 +845,45 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<T>* backward_input_data, dnn::DataType accumulator_type,
       ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
+      dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory);
+
+  template <class T>
+  port::Status DoConvolveBackwardDataImpl(
+      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<T>& filter_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<T> backward_output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& input_descriptor,
+      DeviceMemory<T>* backward_input_data, dnn::DataType accumulator_type,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
       dnn::ProfileResult* output_profile_result);
 
   template <class T>
-  port::Status DoConvolveBackwardFilterImpl(
+  port::Status PrepareForConvolutionBackwardFilterImpl(
       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
       const DeviceMemory<T>& input_data,
-      const dnn::BatchDescriptor& output_descriptor,
+      const dnn::BatchDescriptor& output_descriptor_in,
       DeviceMemory<T> backward_output_data,
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       const dnn::FilterDescriptor& filter_descriptor,
       DeviceMemory<T>* backward_filter_data, dnn::DataType accumulator_type,
       ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
+      dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory);
+
+  template <class T>
+  port::Status DoConvolveBackwardFilterImpl(
+      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+      const DeviceMemory<T>& input_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<T> backward_output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::FilterDescriptor& filter_descriptor,
+      DeviceMemory<T>* backward_filter_data, dnn::DataType accumulator_type,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
       dnn::ProfileResult* output_profile_result);
 
   template <class T>
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index b34d1f722eaf60b21f2289a4b87b5653bfd43bb9..ca7a717bdb9403816140c5f15bacbad35287553b 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -431,7 +431,8 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
     *context = CreatedContexts::Add(new_context);
     CHECK(*context != nullptr)
         << "success in this call must entail non-null result";
-    VLOG(2) << "created or reused context " << context << " for this thread";
+    VLOG(2) << "created or reused context " << new_context
+            << " for this thread";
     return port::Status::OK();
   }
 
@@ -769,13 +770,13 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   ScopedActivateContext activated{context};
   CUresult res = cuStreamCreate(out, 0);
   if (res != CUDA_SUCCESS) {
-    LOG(ERROR) << "could not allocate CUDA stream for context " << context
-               << ": " << ToString(res);
+    LOG(ERROR) << "could not allocate CUDA stream for context "
+               << context->context() << ": " << ToString(res);
     return false;
   }
 
   VLOG(2) << "successfully created stream " << *out << " for context "
-          << context << " on thread";
+          << context->context() << " on thread";
   return true;
 }
 
@@ -788,11 +789,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   ScopedActivateContext activated{context};
   CUresult res = cuStreamDestroy(*stream);
   if (res != CUDA_SUCCESS) {
-    LOG(ERROR) << "failed to destroy CUDA stream for context " << context
-               << ": " << ToString(res);
+    LOG(ERROR) << "failed to destroy CUDA stream for context "
+               << context->context() << ": " << ToString(res);
   } else {
     VLOG(2) << "successfully destroyed stream " << *stream << " for context "
-            << context;
+            << context->context();
     *stream = nullptr;
   }
 }
@@ -809,8 +810,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
     return nullptr;
   }
   void *ptr = reinterpret_cast<void *>(result);
-  VLOG(2) << "allocated " << ptr << " for context " << context << " of "
-          << bytes << " bytes";
+  VLOG(2) << "allocated " << ptr << " for context " << context->context()
+          << " of " << bytes << " bytes";
   return ptr;
 }
 
@@ -823,7 +824,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
     LOG(ERROR) << "failed to free device memory at " << location
                << "; result: " << ToString(res);
   } else {
-    VLOG(2) << "deallocated " << location << " for context " << context;
+    VLOG(2) << "deallocated " << location << " for context "
+            << context->context();
   }
 }
 
@@ -839,8 +841,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
     return nullptr;
   }
   void *ptr = reinterpret_cast<void *>(result);
-  VLOG(2) << "allocated " << ptr << " for context " << context << " of "
-          << bytes << " bytes in unified memory";
+  VLOG(2) << "allocated " << ptr << " for context " << context->context()
+          << " of " << bytes << " bytes in unified memory";
   return ptr;
 }
 
@@ -854,7 +856,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                << "; result: " << ToString(res);
   } else {
     VLOG(2) << "deallocated unified memory at " << location << " for context "
-            << context;
+            << context->context();
   }
 }
 
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.cc b/tensorflow/stream_executor/cuda/cuda_fft.cc
index acac7d6368885537b1f5727779388d550680e90d..776719bc35c3e43bf0e7dc38790640f0e8cc301f 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.cc
+++ b/tensorflow/stream_executor/cuda/cuda_fft.cc
@@ -23,14 +23,10 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/device_memory.h"
-
-#ifndef PLATFORM_GOOGLE
-#include "tensorflow/stream_executor/dso_loader.h"
-#endif
-
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.cc b/tensorflow/stream_executor/cuda/cuda_rng.cc
index 7f920719321637360fdf5c098e83dfaa49164e6c..022ee17ff7226a50326fe89ca77863177b28d0a5 100644
--- a/tensorflow/stream_executor/cuda/cuda_rng.cc
+++ b/tensorflow/stream_executor/cuda/cuda_rng.cc
@@ -21,17 +21,15 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/device_memory.h"
-
-#ifndef PLATFORM_GOOGLE
-#include "tensorflow/stream_executor/dso_loader.h"
-#endif
-
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/rng.h"
+// clang-format off
 #include "cuda/include/curand.h"
+// clang-format on
 
 // Formats curandStatus_t to output prettified values into a log stream.
 std::ostream &operator<<(std::ostream &in, const curandStatus_t &status) {
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index faa662211ebb366b8e20cdc3e33ca651c64cf73a..d91afaa638b2686ef6e39dc06ad61d3b31d377a8 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -368,6 +368,16 @@ BatchDescriptor BatchDescriptor::DepthConcatenateOutputDescriptor(
   return output;
 }
 
+TensorDescriptorProto BatchDescriptor::ToProto(DataType data_type) const {
+  CHECK_EQ(0.0, value_max_);
+  CHECK_EQ(0.0, value_min_);
+  CHECK(quantized_activation_mode_ == QuantizedActivationMode::k8Bit);
+
+  TensorDescriptorProto ret = tensor_;
+  ret.set_data_type(data_type);
+  return ret;
+}
+
 // -- FilterDescriptor
 
 FilterDescriptor::FilterDescriptor(int ndims) {
@@ -434,6 +444,12 @@ int64 FilterDescriptor::ComputeWeightCount() const {
   return ret;
 }
 
+TensorDescriptorProto FilterDescriptor::ToProto(DataType data_type) const {
+  TensorDescriptorProto ret = tensor_;
+  ret.set_data_type(data_type);
+  return ret;
+}
+
 // -- ConvolutionDescriptor
 
 ConvolutionDescriptor::ConvolutionDescriptor(int ndims) {
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 33ca0ff65ae457af2e397138d2a7c51f7c25634a..f5a77d652504dae8ae9e526364f03eae795cb220 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -248,6 +248,12 @@ class BatchDescriptor {
   string ToString() const;
   string ToShortString() const;
 
+  // Pre-condition:
+  //   value_max_ == 0
+  //   value_min_ == 0
+  //   quantized_activation_mode_ == QuantizedActivationMode::k8Bit
+  TensorDescriptorProto ToProto(DataType data_type) const;
+
   // Accessors.
   int64 count() const { return tensor_.dimensions(0); }
   int64 feature_map_count() const { return tensor_.dimensions(1); }
@@ -420,6 +426,7 @@ class FilterDescriptor {
 
   string ToString() const;
   string ToShortString() const;
+  TensorDescriptorProto ToProto(DataType data_type) const;
 
   // Returns the number of weights required as parameters for a convolution
   // using this filter descriptor.
@@ -509,6 +516,7 @@ class ConvolutionDescriptor {
 
   string ToString() const;
   string ToShortString() const;
+  ConvolutionDescriptorProto ToProto() const { return proto_; }
 
   ConvolutionDescriptor& set_zero_padding_height(int64 value) {
     SetDim(padding(), DimIndex::Y, value);
@@ -730,6 +738,7 @@ class PoolingDescriptor {
 class AlgorithmDesc {
  public:
   typedef int64 Index;
+  AlgorithmDesc() : AlgorithmDesc(0, false) {}
   AlgorithmDesc(Index a, bool use_tensor_ops) {
     proto_.set_algo_id(a);
     proto_.set_math_type(use_tensor_ops ? AlgorithmProto::TENSOR_OP_MATH
@@ -745,6 +754,8 @@ class AlgorithmDesc {
   }
   uint64 hash() const;
 
+  AlgorithmProto ToProto() const { return proto_; }
+
  private:
   AlgorithmProto proto_;
 };
@@ -1176,6 +1187,52 @@ class DnnSupport {
     return false;
   }
 
+  virtual bool PrepareForConvolution(
+      Stream* stream, const BatchDescriptor& batch_descriptor,
+      const DeviceMemory<float>& input_data,
+      const FilterDescriptor& filter_descriptor,
+      const DeviceMemory<float>& filter_data,
+      const ConvolutionDescriptor& convolution_descriptor,
+      const BatchDescriptor& output_descriptor,
+      DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
+    *algorithm_desc = {};
+    *scratch_memory = {};
+    return true;
+  }
+
+  virtual bool PrepareForConvolution(
+      Stream* stream, const BatchDescriptor& batch_descriptor,
+      const DeviceMemory<double>& input_data,
+      const FilterDescriptor& filter_descriptor,
+      const DeviceMemory<double>& filter_data,
+      const ConvolutionDescriptor& convolution_descriptor,
+      const BatchDescriptor& output_descriptor,
+      DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
+    *algorithm_desc = {};
+    *scratch_memory = {};
+    return true;
+  }
+
+  virtual bool PrepareForConvolution(
+      Stream* stream, const BatchDescriptor& batch_descriptor,
+      const DeviceMemory<Eigen::half>& input_data,
+      const FilterDescriptor& filter_descriptor,
+      const DeviceMemory<Eigen::half>& filter_data,
+      const ConvolutionDescriptor& convolution_descriptor,
+      const BatchDescriptor& output_descriptor,
+      DeviceMemory<Eigen::half>* output_data,
+      ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
+    *algorithm_desc = {};
+    *scratch_memory = {};
+    return true;
+  }
+
   // Enqueues a single-precision convolution operation onto the stream.
   //
   // Arguments (all borrowed):
@@ -1189,10 +1246,10 @@ class DnnSupport {
   //  output_descriptor: dimensions of the output layer.
   //  output_data: un-owned device memory region in which to place the
   //    convolution result.
-  //  scratch_allocator: un-owned, may-be-null object that may allocate scratch
-  //    space in order to speed up the convolution operation.
-  //  algorithm_config: specifies which algorithm should be used for the
+  //  algorithm_desc: specifies which algorithm should be used for the
   //    operation.
+  //  scratch: un-owned device memory for scratch space in order to speed up
+  //    the convolution operation.
   //  output_profile_result: the output profile result for this call. The
   //    profiling is only enabled when this is not nullptr.
   //
@@ -1217,8 +1274,9 @@ class DnnSupport {
       const DeviceMemory<float>& filter_data,
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
+      DeviceMemory<float>* output_data,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
       ProfileResult* output_profile_result) = 0;
 
   // Enqueues a double-precision convolution operation onto the stream.
@@ -1230,8 +1288,9 @@ class DnnSupport {
       const DeviceMemory<double>& filter_data,
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
+      DeviceMemory<double>* output_data,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
       dnn::ProfileResult* output_profile_result) = 0;
 
   // Enqueues a half-precision convolution operation onto the stream.
@@ -1244,8 +1303,8 @@ class DnnSupport {
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       const dnn::BatchDescriptor& output_descriptor,
       DeviceMemory<Eigen::half>* output_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
       ProfileResult* output_profile_result) = 0;
 
   // Return a list of algorithms supported by the forward convolution pass.
@@ -1301,6 +1360,54 @@ class DnnSupport {
       const BatchDescriptor& output_descriptor,
       DeviceMemory<float>* output_data) = 0;
 
+  virtual bool PrepareForConvolutionBackwardData(
+      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<float>& filter_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<float> backward_output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& input_descriptor,
+      DeviceMemory<float>* backward_input_data,
+      ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
+    *algorithm_desc = {};
+    *scratch_memory = {};
+    return true;
+  }
+
+  virtual bool PrepareForConvolutionBackwardData(
+      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<double>& filter_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<double> backward_output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& input_descriptor,
+      DeviceMemory<double>* backward_input_data,
+      ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
+    *algorithm_desc = {};
+    *scratch_memory = {};
+    return true;
+  }
+
+  virtual bool PrepareForConvolutionBackwardData(
+      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<Eigen::half>& filter_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<Eigen::half> backward_output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& input_descriptor,
+      DeviceMemory<Eigen::half>* backward_input_data,
+      ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
+    *algorithm_desc = {};
+    *scratch_memory = {};
+    return true;
+  }
+
   // Enqueues a single-precision backward convolution (for data) operation onto
   // the stream.
   //
@@ -1320,15 +1427,15 @@ class DnnSupport {
   //  scratch_allocator: un-owned, may-be-null object that may allocate scratch
   //    space in order to speed up the convolution operation.
   virtual bool DoConvolveBackwardData(
-      Stream* stream, const FilterDescriptor& filter_descriptor,
+      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
       const DeviceMemory<float>& filter_data,
-      const BatchDescriptor& output_descriptor,
+      const dnn::BatchDescriptor& output_descriptor,
       DeviceMemory<float> backward_output_data,
-      const ConvolutionDescriptor& convolution_descriptor,
-      const BatchDescriptor& input_descriptor,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& input_descriptor,
       DeviceMemory<float>* backward_input_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
       ProfileResult* output_profile_result) = 0;
 
   // Return a list of algorithms supported by the backward convolution pass for
@@ -1338,28 +1445,76 @@ class DnnSupport {
       std::vector<AlgorithmDesc>* out_algorithms);
 
   virtual bool DoConvolveBackwardData(
-      Stream* stream, const FilterDescriptor& filter_descriptor,
+      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
       const DeviceMemory<double>& filter_data,
-      const BatchDescriptor& output_descriptor,
+      const dnn::BatchDescriptor& output_descriptor,
       DeviceMemory<double> backward_output_data,
-      const ConvolutionDescriptor& convolution_descriptor,
-      const BatchDescriptor& input_descriptor,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& input_descriptor,
       DeviceMemory<double>* backward_input_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
       ProfileResult* output_profile_result) = 0;
 
   virtual bool DoConvolveBackwardData(
-      Stream* stream, const FilterDescriptor& filter_descriptor,
+      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
       const DeviceMemory<Eigen::half>& filter_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<Eigen::half> backward_output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& input_descriptor,
+      DeviceMemory<Eigen::half>* backward_input_data,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
+      ProfileResult* output_profile_result) = 0;
+
+  virtual bool PrepareForConvolutionBackwardFilter(
+      Stream* stream, const BatchDescriptor& input_descriptor,
+      const DeviceMemory<float>& input_data,
+      const BatchDescriptor& output_descriptor,
+      DeviceMemory<float> backward_output_data,
+      const ConvolutionDescriptor& convolution_descriptor,
+      const FilterDescriptor& filter_descriptor,
+      DeviceMemory<float>* backward_filter_data,
+      ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
+    *algorithm_desc = {};
+    *scratch_memory = {};
+    return true;
+  }
+
+  virtual bool PrepareForConvolutionBackwardFilter(
+      Stream* stream, const BatchDescriptor& input_descriptor,
+      const DeviceMemory<double>& input_data,
+      const BatchDescriptor& output_descriptor,
+      DeviceMemory<double> backward_output_data,
+      const ConvolutionDescriptor& convolution_descriptor,
+      const FilterDescriptor& filter_descriptor,
+      DeviceMemory<double>* backward_filter_data,
+      ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
+    *algorithm_desc = {};
+    *scratch_memory = {};
+    return true;
+  }
+
+  virtual bool PrepareForConvolutionBackwardFilter(
+      Stream* stream, const BatchDescriptor& input_descriptor,
+      const DeviceMemory<Eigen::half>& input_data,
       const BatchDescriptor& output_descriptor,
       DeviceMemory<Eigen::half> backward_output_data,
       const ConvolutionDescriptor& convolution_descriptor,
-      const BatchDescriptor& input_descriptor,
-      DeviceMemory<Eigen::half>* backward_input_data,
+      const FilterDescriptor& filter_descriptor,
+      DeviceMemory<Eigen::half>* backward_filter_data,
       ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
-      ProfileResult* output_profile_result) = 0;
+      dnn::AlgorithmDesc* algorithm_desc, DeviceMemory<uint8>* scratch_memory) {
+    *algorithm_desc = {};
+    *scratch_memory = {};
+    return true;
+  }
 
   // Enqueues a single-precision backward convolution (for filter) operation
   // onto the stream.
@@ -1388,8 +1543,8 @@ class DnnSupport {
       const ConvolutionDescriptor& convolution_descriptor,
       const FilterDescriptor& filter_descriptor,
       DeviceMemory<float>* backward_filter_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
       ProfileResult* output_profile_result) = 0;
 
   // Return a list of algorithms supported by the backward convolution pass for
@@ -1406,8 +1561,8 @@ class DnnSupport {
       const ConvolutionDescriptor& convolution_descriptor,
       const FilterDescriptor& filter_descriptor,
       DeviceMemory<double>* backward_filter_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
       ProfileResult* output_profile_result) = 0;
 
   virtual bool DoConvolveBackwardFilter(
@@ -1418,8 +1573,8 @@ class DnnSupport {
       const ConvolutionDescriptor& convolution_descriptor,
       const FilterDescriptor& filter_descriptor,
       DeviceMemory<Eigen::half>* backward_filter_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
       ProfileResult* output_profile_result) = 0;
 
   // Enqueues a single-precision backward convolution (for bias) operation onto
@@ -1607,6 +1762,17 @@ class DnnSupport {
     return false;
   }
 
+  virtual bool DoPoolForward(Stream* stream,
+                             const dnn::PoolingDescriptor& pooling_dimensions,
+                             const dnn::BatchDescriptor& input_dimensions,
+                             const DeviceMemory<int8>& input_data,
+                             const dnn::BatchDescriptor& output_dimensions,
+                             DeviceMemory<int8>* output_data,
+                             ScratchAllocator* workspace_allocator) {
+    LOG(FATAL) << "DoPoolForward not implemented for int8.";
+    return false;
+  }
+
   // Performs differentiation of the pooling operation.
   virtual bool DoPoolBackward(Stream* stream,
                               const dnn::PoolingDescriptor& pooling_dimensions,
diff --git a/tensorflow/stream_executor/dnn.proto b/tensorflow/stream_executor/dnn.proto
index 56b079c3f5b962636e7c75b46449adca8e13a43e..11fb5d0f6a02a32fd3c958133136b078ac848ac3 100644
--- a/tensorflow/stream_executor/dnn.proto
+++ b/tensorflow/stream_executor/dnn.proto
@@ -66,6 +66,13 @@ enum ConvolutionMode {
   CONVOLUTION = 1;
 }
 
+enum ConvolutionKind {
+  INVALID = 0;
+  FORWARD = 1;
+  BACKWARD_FILTER = 2;
+  BACKWARD_DATA = 3;
+}
+
 // Generic tensor representation.
 message TensorDescriptorProto {
   repeated int64 dimensions = 1;
@@ -101,3 +108,22 @@ message ConvolutionDescriptorProto {
   int32 group_count = 5;
   ConvolutionMode convolution_mode = 6;
 }
+
+// A convolution. Currently it's only used for logging. In the future, we may
+// want to use it in the API as well.
+message ConvolutionProto {
+  ConvolutionKind kind = 1;
+  TensorDescriptorProto input = 2;
+  TensorDescriptorProto filter = 3;
+  TensorDescriptorProto output = 4;
+  AlgorithmProto algorithm = 5;
+  ConvolutionDescriptorProto conv_desc = 6;
+
+  // result = conv_scale * conv(...) + side_value_scale * side_value.
+  // side_value is an arbitrary buffer if activation is not none. Otherwise, it
+  // has to be the result buffer (using its old values).
+  double conv_scale = 7;
+  double side_value_scale = 8;
+
+  ActivationMode activation = 9;
+}
diff --git a/tensorflow/stream_executor/host/BUILD b/tensorflow/stream_executor/host/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..59472b14c14e55b3e72e307af2033a2720ad0b4d
--- /dev/null
+++ b/tensorflow/stream_executor/host/BUILD
@@ -0,0 +1,108 @@
+# Description:
+#   Host-platform specific StreamExecutor support code.
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/stream_executor:build_defs.bzl", "stream_executor_friends")
+
+package_group(
+    name = "friends",
+    packages = stream_executor_friends(),
+)
+
+package(default_visibility = [":friends"])
+
+# Filegroup used to collect source files for the dependency check.
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+cc_library(
+    name = "host_platform_id",
+    srcs = [
+        "host_platform_id.cc",
+    ],
+    hdrs = [
+        "host_platform_id.h",
+    ],
+    deps = [
+        "//tensorflow/stream_executor:platform",
+    ],
+)
+
+cc_library(
+    name = "host_platform",
+    srcs = [
+        "host_platform.cc",
+    ],
+    hdrs = [
+        "host_platform.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":host_gpu_executor",
+        ":host_platform_id",
+        "//tensorflow/stream_executor:executor_cache",
+        "//tensorflow/stream_executor:multi_platform_manager",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ],
+    alwayslink = True,  # Registers itself with the MultiPlatformManager.
+)
+
+cc_library(
+    name = "host_stream",
+    srcs = [
+        "host_stream.cc",
+    ],
+    hdrs = [
+        "host_stream.h",
+    ],
+    deps = [
+        "//tensorflow/stream_executor:kernel",
+        "//tensorflow/stream_executor/lib",
+    ],
+)
+
+cc_library(
+    name = "host_timer",
+    srcs = [
+        "host_timer.cc",
+    ],
+    hdrs = [
+        "host_timer.h",
+    ],
+    deps = [
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor:timer",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+# TODO(22689637): Rename this target.
+cc_library(
+    name = "host_gpu_executor",
+    srcs = [
+        "host_gpu_executor.cc",
+    ],
+    hdrs = [
+        "host_gpu_executor.h",
+    ],
+    deps = [
+        ":host_platform_id",
+        ":host_stream",
+        ":host_timer",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor:kernel",
+        "//tensorflow/stream_executor:rng",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor:timer",
+        "//tensorflow/stream_executor/lib",
+    ],
+    alwayslink = True,
+)
diff --git a/tensorflow/stream_executor/lib/BUILD b/tensorflow/stream_executor/lib/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..133ff2b161b9db227a6a4921865f56bfc4b9bece
--- /dev/null
+++ b/tensorflow/stream_executor/lib/BUILD
@@ -0,0 +1,62 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/stream_executor:build_defs.bzl", "stream_executor_friends")
+
+package_group(
+    name = "friends",
+    packages = stream_executor_friends(),
+)
+
+package(default_visibility = [":friends"])
+
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+cc_library(
+    name = "lib",
+    srcs = glob(
+        [
+            "**/*.cc",
+        ],
+        exclude = [
+            "**/*test*",
+        ],
+    ),
+    hdrs = glob(["**/*.h"]),
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:ptr_util",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+tf_cc_test(
+    name = "statusor_test",
+    size = "small",
+    srcs = ["statusor_test.cc"],
+    deps = [
+        ":lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "utility_headers",
+    hdrs = [
+        "ptr_util.h",
+    ],
+    deps = [
+        "//tensorflow/core:ptr_util",
+    ],
+)
diff --git a/tensorflow/stream_executor/lib/initialize.h b/tensorflow/stream_executor/lib/initialize.h
index 688b0214694478e9be1b1d14e58fda94367f547b..cd0b9dad19bf1d0e4e07bc153d94664fda12bd98 100644
--- a/tensorflow/stream_executor/lib/initialize.h
+++ b/tensorflow/stream_executor/lib/initialize.h
@@ -16,55 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_INITIALIZE_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_INITIALIZE_H_
 
-#include "tensorflow/stream_executor/platform/port.h"
-
-#if defined(PLATFORM_GOOGLE)
-#include "tensorflow/stream_executor/platform/google/initialize.h"
-#else
-
-#undef REGISTER_MODULE_INITIALIZER
-#undef DECLARE_MODULE_INITIALIZER
-#undef REGISTER_MODULE_INITIALIZER_SEQUENCE
-
-namespace stream_executor {
-namespace port {
-
-class Initializer {
- public:
-  typedef void (*InitializerFunc)();
-  explicit Initializer(InitializerFunc func) { func(); }
-
-  struct Dependency {
-    Dependency(const char *n, Initializer *i) : name(n), initializer(i) {}
-    const char *const name;
-    Initializer *const initializer;
-  };
-
-  struct DependencyRegisterer {
-    DependencyRegisterer(const char *type, const char *name,
-                         Initializer *initializer,
-                         const Dependency &dependency);
-  };
-};
-
-}  // namespace port
-}  // namespace stream_executor
-
-#define REGISTER_INITIALIZER(type, name, body)                             \
-  static void google_init_##type##_##name() { body; }                      \
-  ::stream_executor::port::Initializer google_initializer_##type##_##name( \
-      google_init_##type##_##name)
-
-#define REGISTER_MODULE_INITIALIZER(name, body) \
-  REGISTER_INITIALIZER(module, name, body)
-
-#define DECLARE_INITIALIZER(type, name) \
-  extern ::stream_executor::port::Initializer google_initializer_##type##_##name
-
-#define DECLARE_MODULE_INITIALIZER(name) DECLARE_INITIALIZER(module, name)
-
-#define REGISTER_MODULE_INITIALIZER_SEQUENCE(name1, name2)
-
-#endif  // !defined(PLATFORM_GOOGLE)
+#include "tensorflow/stream_executor/platform/initialize.h"
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_INITIALIZE_H_
diff --git a/tensorflow/stream_executor/logging.proto b/tensorflow/stream_executor/logging.proto
index 2c75500cda452f787cb174238058f026a31e4242..68021d2b3157ceeaabd0d0a2065bc946913f64c4 100644
--- a/tensorflow/stream_executor/logging.proto
+++ b/tensorflow/stream_executor/logging.proto
@@ -2,6 +2,8 @@ syntax = "proto3";
 
 package stream_executor;
 
+import "tensorflow/stream_executor/dnn.proto";
+
 message CudnnVersion {
   int32 major = 1;
   int32 minor = 2;
@@ -17,3 +19,11 @@ message CudaInfo {
   CudnnVersion cudnn_version = 1;
   ComputeCapability compute_capability = 2;
 }
+
+message ConvLogEntry {
+  CudaInfo cuda_info = 1;
+  dnn.ConvolutionProto convolution = 2;
+
+  // Profiled time in ms. 0.0 if the convolution is not profiled.
+  float profile_time_ms = 3;
+}
diff --git a/tensorflow/stream_executor/platform/BUILD b/tensorflow/stream_executor/platform/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..702b2cdfe0dd41997f99daf1bcdcbf8a6994edd8
--- /dev/null
+++ b/tensorflow/stream_executor/platform/BUILD
@@ -0,0 +1,47 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/stream_executor:build_defs.bzl", "stream_executor_friends")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_platform_hdrs")
+
+package_group(
+    name = "friends",
+    packages = stream_executor_friends(),
+)
+
+package(
+    default_visibility = [":friends"],
+)
+
+cc_library(
+    name = "platform",
+    textual_hdrs = [
+        "logging.h",
+        "mutex.h",
+        "platform.h",
+        "port.h",
+        "thread_annotations.h",
+        "initialize.h",
+    ],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/platform/default:platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "dso_loader",
+    hdrs = ["dso_loader.h"],
+    deps = [
+        ":platform",
+        "//tensorflow/stream_executor/platform/default:dso_loader",
+    ],
+)
+
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
diff --git a/tensorflow/stream_executor/platform/default/BUILD b/tensorflow/stream_executor/platform/default/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f1ae7d86ff78a50da51ef730098cee2fc9e30aad
--- /dev/null
+++ b/tensorflow/stream_executor/platform/default/BUILD
@@ -0,0 +1,25 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow/stream_executor:__subpackages__"])
+
+cc_library(
+    name = "platform",
+    textual_hdrs = [
+        "initialize.h",
+        "mutex.h",
+    ],
+    deps = ["//tensorflow/core:lib"],
+)
+
+cc_library(
+    name = "dso_loader",
+    srcs = ["dso_loader.cc"],
+    hdrs = ["dso_loader.h"],
+    deps = [
+        "//tensorflow/stream_executor:platform",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
diff --git a/tensorflow/stream_executor/dso_loader.cc b/tensorflow/stream_executor/platform/default/dso_loader.cc
similarity index 96%
rename from tensorflow/stream_executor/dso_loader.cc
rename to tensorflow/stream_executor/platform/default/dso_loader.cc
index 6dda5d63155d8f9cf8d068b3feae51b1fba88a51..668eeee3f31ff257092674de98c7d20c39c46a73 100644
--- a/tensorflow/stream_executor/dso_loader.cc
+++ b/tensorflow/stream_executor/platform/default/dso_loader.cc
@@ -16,8 +16,6 @@ limitations under the License.
 // TODO(jhen): Replace hardcoded, platform specific path strings in GetXXXPath()
 // with a function in e.g. cuda.h.
 
-#include "tensorflow/stream_executor/dso_loader.h"
-
 #include <limits.h>
 #include <stdlib.h>
 #include <initializer_list>
@@ -30,6 +28,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/path.h"
 #include "tensorflow/stream_executor/lib/str_util.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/platform/default/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
@@ -89,10 +88,13 @@ string GetCudnnVersion() { return TF_CUDNN_VERSION; }
 #if defined(__APPLE__)
   // On Mac OS X, CUDA sometimes installs libcuda.dylib instead of
   // libcuda.1.dylib.
-  return status.ok() ? status : GetDsoHandle(
-     FindDsoPath(port::Env::Default()->FormatLibraryFileName("cuda", ""),
-                 GetCudaDriverLibraryPath()),
-     dso_handle);
+  return status.ok()
+             ? status
+             : GetDsoHandle(
+                   FindDsoPath(
+                       port::Env::Default()->FormatLibraryFileName("cuda", ""),
+                       GetCudaDriverLibraryPath()),
+                   dso_handle);
 #else
   return status;
 #endif
@@ -144,7 +146,7 @@ static mutex& GetRpathMutex() {
               << ". LD_LIBRARY_PATH: "
               << (ld_library_path != nullptr ? ld_library_path : "")
 #endif
-    ;
+        ;
     return port::Status(port::error::FAILED_PRECONDITION,
                         absl::StrCat("could not dlopen DSO: ", path,
                                      "; dlerror: ", s.error_message()));
diff --git a/tensorflow/stream_executor/dso_loader.h b/tensorflow/stream_executor/platform/default/dso_loader.h
similarity index 100%
rename from tensorflow/stream_executor/dso_loader.h
rename to tensorflow/stream_executor/platform/default/dso_loader.h
index f063b68d6058f7b1faecfd83d3d21b899cf027a3..806f65b24cdc209dd14a727de6a724bcd1705075 100644
--- a/tensorflow/stream_executor/dso_loader.h
+++ b/tensorflow/stream_executor/platform/default/dso_loader.h
@@ -19,8 +19,8 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_DSO_LOADER_H_
 #define TENSORFLOW_STREAM_EXECUTOR_DSO_LOADER_H_
 
-#include "tensorflow/stream_executor/platform/port.h"
 #include <vector>
+#include "tensorflow/stream_executor/platform/port.h"
 
 #include "absl/strings/string_view.h"
 #include "tensorflow/stream_executor/lib/status.h"
diff --git a/tensorflow/stream_executor/platform/default/initialize.h b/tensorflow/stream_executor/platform/default/initialize.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d27c85336e1ca64ebcc6969f2179399529e8b37
--- /dev/null
+++ b/tensorflow/stream_executor/platform/default/initialize.h
@@ -0,0 +1,62 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_INITIALIZE_H_
+#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_INITIALIZE_H_
+
+#undef REGISTER_MODULE_INITIALIZER
+#undef DECLARE_MODULE_INITIALIZER
+#undef REGISTER_MODULE_INITIALIZER_SEQUENCE
+
+namespace stream_executor {
+namespace port {
+
+class Initializer {
+ public:
+  typedef void (*InitializerFunc)();
+  explicit Initializer(InitializerFunc func) { func(); }
+
+  struct Dependency {
+    Dependency(const char *n, Initializer *i) : name(n), initializer(i) {}
+    const char *const name;
+    Initializer *const initializer;
+  };
+
+  struct DependencyRegisterer {
+    DependencyRegisterer(const char *type, const char *name,
+                         Initializer *initializer,
+                         const Dependency &dependency);
+  };
+};
+
+}  // namespace port
+}  // namespace stream_executor
+
+#define REGISTER_INITIALIZER(type, name, body)                             \
+  static void google_init_##type##_##name() { body; }                      \
+  ::stream_executor::port::Initializer google_initializer_##type##_##name( \
+      google_init_##type##_##name)
+
+#define REGISTER_MODULE_INITIALIZER(name, body) \
+  REGISTER_INITIALIZER(module, name, body)
+
+#define DECLARE_INITIALIZER(type, name) \
+  extern ::stream_executor::port::Initializer google_initializer_##type##_##name
+
+#define DECLARE_MODULE_INITIALIZER(name) DECLARE_INITIALIZER(module, name)
+
+#define REGISTER_MODULE_INITIALIZER_SEQUENCE(name1, name2)
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_INITIALIZE_H_
diff --git a/tensorflow/stream_executor/platform/default/mutex.h b/tensorflow/stream_executor/platform/default/mutex.h
index c9f5a7c609e5bbe59ea456e30d575b991aa37b65..2f8f0636ba7bd037f356525047f2dd7c0eda789d 100644
--- a/tensorflow/stream_executor/platform/default/mutex.h
+++ b/tensorflow/stream_executor/platform/default/mutex.h
@@ -16,7 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_MUTEX_H_
 #define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_MUTEX_H_
 
-#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace stream_executor {
 
diff --git a/tensorflow/stream_executor/platform/dso_loader.h b/tensorflow/stream_executor/platform/dso_loader.h
new file mode 100644
index 0000000000000000000000000000000000000000..1dd56684b1917b07ba6e421479b14ac22af5d335
--- /dev/null
+++ b/tensorflow/stream_executor/platform/dso_loader.h
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DSO_LOADER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DSO_LOADER_H_
+
+#include "tensorflow/stream_executor/platform/platform.h"
+
+// Include appropriate platform-dependent implementations
+#if defined(PLATFORM_GOOGLE)
+#include "tensorflow/stream_executor/platform/google/dso_loader.h"
+#elif defined(PLATFORM_POSIX) || defined(PLATFORM_POSIX_ANDROID) || \
+    defined(PLATFORM_GOOGLE_ANDROID)
+#include "tensorflow/stream_executor/platform/default/dso_loader.h"
+#else
+#error Define the appropriate PLATFORM_<foo> macro for this platform
+#endif
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DSO_LOADER_H_
diff --git a/tensorflow/stream_executor/platform/initialize.h b/tensorflow/stream_executor/platform/initialize.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb13132afff7c9f6d4c57176eef8d7180bb45a93
--- /dev/null
+++ b/tensorflow/stream_executor/platform/initialize.h
@@ -0,0 +1,27 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_INITIALIZE_H_
+#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_INITIALIZE_H_
+
+#include "tensorflow/stream_executor/platform/platform.h"
+
+#if defined(PLATFORM_GOOGLE)
+#include "tensorflow/stream_executor/platform/google/initialize.h"
+#else
+#include "tensorflow/stream_executor/platform/default/initialize.h"
+#endif
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_INITIALIZE_H_
diff --git a/tensorflow/stream_executor/platform/mutex.h b/tensorflow/stream_executor/platform/mutex.h
index 28828951de521752e8debfc1b6cfd2de73a09828..fa6c8c017c30b66baf07e1ee19f4326d7c01b9c3 100644
--- a/tensorflow/stream_executor/platform/mutex.h
+++ b/tensorflow/stream_executor/platform/mutex.h
@@ -16,8 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_MUTEX_H_
 #define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_MUTEX_H_
 
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/platform/platform.h"
 
 #if defined(PLATFORM_GOOGLE)
 #include "tensorflow/stream_executor/platform/google/mutex.h"
diff --git a/tensorflow/stream_executor/platform/platform.h b/tensorflow/stream_executor/platform/platform.h
new file mode 100644
index 0000000000000000000000000000000000000000..5bf0e120d39f8bfa8e1a62ae3749beac076335c6
--- /dev/null
+++ b/tensorflow/stream_executor/platform/platform.h
@@ -0,0 +1,36 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_PLATFORM_H_
+#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_PLATFORM_H_
+
+#if !defined(PLATFORM_POSIX) && !defined(PLATFORM_GOOGLE) && \
+    !defined(PLATFORM_POSIX_ANDROID) && !defined(PLATFORM_GOOGLE_ANDROID)
+
+// Choose which platform we are on.
+#if defined(ANDROID) || defined(__ANDROID__)
+#define PLATFORM_POSIX_ANDROID
+
+#elif defined(__APPLE__)
+#define PLATFORM_POSIX
+
+#else
+// If no platform specified, use:
+#define PLATFORM_POSIX
+
+#endif
+#endif
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_PLATFORM_H_
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 3edc66cde8045d7f6ae53095e8136d1697fb1d23..1befc18e1951d2742c62d44132f338cd9f392085 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -549,11 +549,16 @@ Stream &Stream::ThenConvolveWithScratch(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoConvolve(
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      CheckError(dnn->PrepareForConvolution(
           this, input_descriptor, input_data, filter_descriptor, filter_data,
           convolution_descriptor, output_descriptor, output, scratch_allocator,
-          dnn::AlgorithmConfig(),
-          /*output_profile_result=*/nullptr));
+          dnn::AlgorithmConfig(), &algorithm_desc, &scratch_memory));
+      CheckError(dnn->DoConvolve(
+          this, input_descriptor, input_data, filter_descriptor, filter_data,
+          convolution_descriptor, output_descriptor, output, algorithm_desc,
+          &scratch_memory, nullptr));
     } else {
       SetErrorAndLogNoDnnSupport();
     }
@@ -576,11 +581,16 @@ Stream &Stream::ThenConvolveWithScratch(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoConvolve(
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      CheckError(dnn->PrepareForConvolution(
           this, input_descriptor, input_data, filter_descriptor, filter_data,
           convolution_descriptor, output_descriptor, output, scratch_allocator,
-          dnn::AlgorithmConfig(),
-          /*output_profile_result=*/nullptr));
+          dnn::AlgorithmConfig(), &algorithm_desc, &scratch_memory));
+      CheckError(dnn->DoConvolve(
+          this, input_descriptor, input_data, filter_descriptor, filter_data,
+          convolution_descriptor, output_descriptor, output, algorithm_desc,
+          &scratch_memory, nullptr));
     } else {
       SetErrorAndLogNoDnnSupport();
     }
@@ -758,10 +768,18 @@ Stream &Stream::ThenConvolveWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolve(
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status = dnn->PrepareForConvolution(
           this, input_descriptor, input_data, filter_descriptor, filter_data,
           convolution_descriptor, output_descriptor, output, scratch_allocator,
-          algorithm_config, output_profile_result);
+          algorithm_config, &algorithm_desc, &scratch_memory);
+      if (status) {
+        status = dnn->DoConvolve(
+            this, input_descriptor, input_data, filter_descriptor, filter_data,
+            convolution_descriptor, output_descriptor, output, algorithm_desc,
+            &scratch_memory, output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -789,10 +807,18 @@ Stream &Stream::ThenConvolveWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolve(
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status = dnn->PrepareForConvolution(
           this, input_descriptor, input_data, filter_descriptor, filter_data,
           convolution_descriptor, output_descriptor, output, scratch_allocator,
-          algorithm_config, output_profile_result);
+          algorithm_config, &algorithm_desc, &scratch_memory);
+      if (status) {
+        status = dnn->DoConvolve(
+            this, input_descriptor, input_data, filter_descriptor, filter_data,
+            convolution_descriptor, output_descriptor, output, algorithm_desc,
+            &scratch_memory, output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -820,10 +846,18 @@ Stream &Stream::ThenConvolveWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolve(
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status = dnn->PrepareForConvolution(
           this, input_descriptor, input_data, filter_descriptor, filter_data,
           convolution_descriptor, output_descriptor, output, scratch_allocator,
-          algorithm_config, output_profile_result);
+          algorithm_config, &algorithm_desc, &scratch_memory);
+      if (status) {
+        status = dnn->DoConvolve(
+            this, input_descriptor, input_data, filter_descriptor, filter_data,
+            convolution_descriptor, output_descriptor, output, algorithm_desc,
+            &scratch_memory, output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -969,10 +1003,17 @@ Stream &Stream::ThenConvolveBackwardDataWithScratch(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoConvolveBackwardData(
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      CheckError(dnn->PrepareForConvolutionBackwardData(
           this, filter_descriptor, filter_data, output_descriptor,
           backward_output_data, convolution_descriptor, input_descriptor,
           backward_input_data, scratch_allocator, dnn::AlgorithmConfig(),
+          &algorithm_desc, &scratch_memory));
+      CheckError(dnn->DoConvolveBackwardData(
+          this, filter_descriptor, filter_data, output_descriptor,
+          backward_output_data, convolution_descriptor, input_descriptor,
+          backward_input_data, algorithm_desc, &scratch_memory,
           /*output_profile_result=*/nullptr));
     } else {
       SetErrorAndLogNoDnnSupport();
@@ -999,11 +1040,20 @@ Stream &Stream::ThenConvolveBackwardDataWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolveBackwardData(
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status = dnn->PrepareForConvolutionBackwardData(
           this, filter_descriptor, filter_data, output_descriptor,
           backward_output_data, convolution_descriptor, input_descriptor,
           backward_input_data, scratch_allocator, algorithm_config,
-          output_profile_result);
+          &algorithm_desc, &scratch_memory);
+      if (status) {
+        status = dnn->DoConvolveBackwardData(
+            this, filter_descriptor, filter_data, output_descriptor,
+            backward_output_data, convolution_descriptor, input_descriptor,
+            backward_input_data, algorithm_desc, &scratch_memory,
+            output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -1032,11 +1082,20 @@ Stream &Stream::ThenConvolveBackwardDataWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolveBackwardData(
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status = dnn->PrepareForConvolutionBackwardData(
           this, filter_descriptor, filter_data, output_descriptor,
           backward_output_data, convolution_descriptor, input_descriptor,
           backward_input_data, scratch_allocator, algorithm_config,
-          output_profile_result);
+          &algorithm_desc, &scratch_memory);
+      if (status) {
+        status = dnn->DoConvolveBackwardData(
+            this, filter_descriptor, filter_data, output_descriptor,
+            backward_output_data, convolution_descriptor, input_descriptor,
+            backward_input_data, algorithm_desc, &scratch_memory,
+            output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -1065,11 +1124,20 @@ Stream &Stream::ThenConvolveBackwardDataWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolveBackwardData(
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status = dnn->PrepareForConvolutionBackwardData(
           this, filter_descriptor, filter_data, output_descriptor,
           backward_output_data, convolution_descriptor, input_descriptor,
           backward_input_data, scratch_allocator, algorithm_config,
-          output_profile_result);
+          &algorithm_desc, &scratch_memory);
+      if (status) {
+        status = dnn->DoConvolveBackwardData(
+            this, filter_descriptor, filter_data, output_descriptor,
+            backward_output_data, convolution_descriptor, input_descriptor,
+            backward_input_data, algorithm_desc, &scratch_memory,
+            output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -1096,10 +1164,17 @@ Stream &Stream::ThenConvolveBackwardDataWithScratch(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoConvolveBackwardData(
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      CheckError(dnn->PrepareForConvolutionBackwardData(
           this, filter_descriptor, filter_data, output_descriptor,
           backward_output_data, convolution_descriptor, input_descriptor,
           backward_input_data, scratch_allocator, dnn::AlgorithmConfig(),
+          &algorithm_desc, &scratch_memory));
+      CheckError(dnn->DoConvolveBackwardData(
+          this, filter_descriptor, filter_data, output_descriptor,
+          backward_output_data, convolution_descriptor, input_descriptor,
+          backward_input_data, algorithm_desc, &scratch_memory,
           /*output_profile_result=*/nullptr));
     } else {
       SetErrorAndLogNoDnnSupport();
@@ -1138,10 +1213,17 @@ Stream &Stream::ThenConvolveBackwardFilterWithScratch(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoConvolveBackwardFilter(
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      CheckError(dnn->PrepareForConvolutionBackwardFilter(
           this, input_descriptor, input_data, output_descriptor,
           backward_output_data, convolution_descriptor, filter_descriptor,
           backward_filter_data, scratch_allocator, dnn::AlgorithmConfig(),
+          &algorithm_desc, &scratch_memory));
+      CheckError(dnn->DoConvolveBackwardFilter(
+          this, input_descriptor, input_data, output_descriptor,
+          backward_output_data, convolution_descriptor, filter_descriptor,
+          backward_filter_data, algorithm_desc, &scratch_memory,
           /*output_profile_result=*/nullptr));
     } else {
       SetErrorAndLogNoDnnSupport();
@@ -1168,11 +1250,20 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolveBackwardFilter(
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status = dnn->PrepareForConvolutionBackwardFilter(
           this, input_descriptor, input_data, output_descriptor,
           backward_output_data, convolution_descriptor, filter_descriptor,
           backward_filter_data, scratch_allocator, algorithm_config,
-          output_profile_result);
+          &algorithm_desc, &scratch_memory);
+      if (status) {
+        status = dnn->DoConvolveBackwardFilter(
+            this, input_descriptor, input_data, output_descriptor,
+            backward_output_data, convolution_descriptor, filter_descriptor,
+            backward_filter_data, algorithm_desc, &scratch_memory,
+            output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -1201,11 +1292,20 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolveBackwardFilter(
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status = dnn->PrepareForConvolutionBackwardFilter(
           this, input_descriptor, input_data, output_descriptor,
           backward_output_data, convolution_descriptor, filter_descriptor,
           backward_filter_data, scratch_allocator, algorithm_config,
-          output_profile_result);
+          &algorithm_desc, &scratch_memory);
+      if (status) {
+        status = dnn->DoConvolveBackwardFilter(
+            this, input_descriptor, input_data, output_descriptor,
+            backward_output_data, convolution_descriptor, filter_descriptor,
+            backward_filter_data, algorithm_desc, &scratch_memory,
+            output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -1232,10 +1332,17 @@ Stream &Stream::ThenConvolveBackwardFilterWithScratch(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoConvolveBackwardFilter(
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      CheckError(dnn->PrepareForConvolutionBackwardFilter(
           this, input_descriptor, input_data, output_descriptor,
           backward_output_data, convolution_descriptor, filter_descriptor,
           backward_filter_data, scratch_allocator, dnn::AlgorithmConfig(),
+          &algorithm_desc, &scratch_memory));
+      CheckError(dnn->DoConvolveBackwardFilter(
+          this, input_descriptor, input_data, output_descriptor,
+          backward_output_data, convolution_descriptor, filter_descriptor,
+          backward_filter_data, algorithm_desc, &scratch_memory,
           /*output_profile_result=*/nullptr));
     } else {
       SetErrorAndLogNoDnnSupport();
@@ -1262,11 +1369,20 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolveBackwardFilter(
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status = dnn->PrepareForConvolutionBackwardFilter(
           this, input_descriptor, input_data, output_descriptor,
           backward_output_data, convolution_descriptor, filter_descriptor,
           backward_filter_data, scratch_allocator, algorithm_config,
-          output_profile_result);
+          &algorithm_desc, &scratch_memory);
+      if (status) {
+        status = dnn->DoConvolveBackwardFilter(
+            this, input_descriptor, input_data, output_descriptor,
+            backward_output_data, convolution_descriptor, filter_descriptor,
+            backward_filter_data, algorithm_desc, &scratch_memory,
+            output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -1490,6 +1606,28 @@ Stream &Stream::ThenPoolForward(
   return *this;
 }
 
+Stream &Stream::ThenPoolForward(
+    const dnn::PoolingDescriptor &pooling_dimensions,
+    const dnn::BatchDescriptor &input_dimensions,
+    const DeviceMemory<int8> &input_data,
+    const dnn::BatchDescriptor &output_dimensions,
+    DeviceMemory<int8> *output_data, ScratchAllocator *workspace_allocator) {
+  VLOG_CALL(PARAM(pooling_dimensions), PARAM(input_dimensions),
+            PARAM(input_data), PARAM(output_dimensions), PARAM(output_data),
+            PARAM(workspace_allocator));
+
+  if (ok()) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      CheckError(dnn->DoPoolForward(this, pooling_dimensions, input_dimensions,
+                                    input_data, output_dimensions, output_data,
+                                    workspace_allocator));
+    } else {
+      SetErrorAndLogNoDnnSupport();
+    }
+  }
+  return *this;
+}
+
 Stream &Stream::ThenPoolBackward(
     const dnn::PoolingDescriptor &pooling_dimensions,
     const dnn::BatchDescriptor &input_dimensions,
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index 0fc90cf83d6b4e3e0ede84747f8149c1a25289ca..d5e2fdf58df7a29f059a9495f2d1271d3949b8a5 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -650,6 +650,13 @@ class Stream {
                           DeviceMemory<Eigen::half> *output_data,
                           ScratchAllocator *workspace_allocator = nullptr);
 
+  Stream &ThenPoolForward(const dnn::PoolingDescriptor &pooling_dimensions,
+                          const dnn::BatchDescriptor &input_dimensions,
+                          const DeviceMemory<int8> &input_data,
+                          const dnn::BatchDescriptor &output_dimensions,
+                          DeviceMemory<int8> *output_data,
+                          ScratchAllocator *workspace_allocator = nullptr);
+
   Stream &ThenPoolBackward(const dnn::PoolingDescriptor &pooling_dimensions,
                            const dnn::BatchDescriptor &input_dimensions,
                            const DeviceMemory<double> &input_data,
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index cb67a906a824574403d7aeb5a195f9e44a5d0426..439c73ec8f61388cd3d02283bd1724cdf69b04e4 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include <utility>
 
 #include "absl/strings/str_cat.h"
-#include "tensorflow/core/platform/logger.h"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/blas.h"
 #include "tensorflow/stream_executor/fft.h"
@@ -34,7 +33,6 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/str_util.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/lib/threadpool.h"
-#include "tensorflow/stream_executor/logging.pb.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/rng.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
@@ -221,31 +219,7 @@ StreamExecutor::~StreamExecutor() {
 port::Status StreamExecutor::Init(int device_ordinal,
                                   DeviceOptions device_options) {
   device_ordinal_ = device_ordinal;
-  TF_RETURN_IF_ERROR(
-      implementation_->Init(device_ordinal, std::move(device_options)));
-
-  if (platform_kind_ == PlatformKind::kCuda) {
-    CudaInfo info;
-
-    int cc_major, cc_minor;
-    GetDeviceDescription().cuda_compute_capability(&cc_major, &cc_minor);
-    info.mutable_compute_capability()->set_major(cc_major);
-    info.mutable_compute_capability()->set_minor(cc_minor);
-
-    if (auto *dnn = AsDnn()) {
-      port::StatusOr<dnn::VersionInfo> version_or = dnn->GetVersion();
-      if (version_or.ok()) {
-        const auto &version = version_or.ValueOrDie();
-        info.mutable_cudnn_version()->set_major(version.major_version());
-        info.mutable_cudnn_version()->set_minor(version.minor_version());
-        info.mutable_cudnn_version()->set_patch(version.patch());
-      }
-    }
-
-    tensorflow::Logger::Singleton()->LogProto(info);
-  }
-
-  return port::Status::OK();
+  return implementation_->Init(device_ordinal, std::move(device_options));
 }
 
 port::Status StreamExecutor::Init() {
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index fbdfa2cf6dd2b8da4be7db944b863e4ef06ad5f3..1024b686ebd435beff934f2c4d914fba80a70b41 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1793,6 +1793,7 @@ def tf_py_test(
         tags = [],
         shard_count = 1,
         additional_deps = [],
+        additional_visibility = [],
         kernels = [],
         flaky = 0,
         xla_enabled = False,
@@ -1813,7 +1814,7 @@ def tf_py_test(
         shard_count = shard_count,
         srcs_version = "PY2AND3",
         tags = tags,
-        visibility = [clean_dep("//tensorflow:internal")],
+        visibility = [clean_dep("//tensorflow:internal")] + additional_visibility,
         deps = [
             clean_dep("//tensorflow/python:extra_py_tests_deps"),
             clean_dep("//tensorflow/python:gradient_checker"),
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
index 2d115904925eb96164484300baf628d41d3fcff4..a262c0f799634470090eeba90f480f94ac671f87 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
@@ -86,7 +86,7 @@ tf_module {
   }
   member_method {
     name: "bucket_by_sequence_length"
-    argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\'], "
+    argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "cardinality"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt
index b06c73d12602b25426034f801be329fb88067011..9c29067b6d83a1e753f01ca62e89bfef559fc824 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt
@@ -75,6 +75,10 @@ tf_class {
     name: "experimental_initialize"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
   member_method {
     name: "experimental_run"
     argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt
index df707e8920e4488ed6b40a7f93f56b5624188c84..22f8160c964de394a7cc4a51d1d17218a0da8b56 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt
@@ -26,6 +26,10 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'strategy\', \'replica_id_in_sync_group\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "all_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "merge_call"
     argspec: "args=[\'self\', \'merge_fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt
index 77706e57133e1186d9e98fcf9205ed4c91772eda..37b620891fbf84d33919365aa3efbcd1055ac762 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt
@@ -50,6 +50,10 @@ tf_class {
     name: "colocate_vars_with"
     argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "experimental_make_numpy_dataset"
+    argspec: "args=[\'self\', \'numpy_input\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "experimental_run_steps_on_iterator"
     argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
index 9a1df5514261a47aae6f3d11be78b5a6fa6da919..4aa6f1c4e144366d194535d107a32f888caf8a54 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
@@ -74,6 +74,10 @@ tf_class {
     name: "experimental_initialize"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
   member_method {
     name: "experimental_run"
     argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.-g-file.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.-g-file.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c6bf57a88fc1295da13e0b58671191c9d8ba8caa
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.-g-file.pbtxt
@@ -0,0 +1,58 @@
+path: "tensorflow.io.gfile.GFile"
+tf_class {
+  is_instance: "<class \'tensorflow.python.platform.gfile.GFile\'>"
+  is_instance: "<class \'tensorflow.python.lib.io.file_io.FileIO\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "mode"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flush"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "next"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read"
+    argspec: "args=[\'self\', \'n\'], varargs=None, keywords=None, defaults=[\'-1\'], "
+  }
+  member_method {
+    name: "readline"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "readlines"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "seek"
+    argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "tell"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "write"
+    argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt
index cfa3372b12bfe32eed4311c89b6448c0359c0913..a797c06ff337cffe503d89c09497996ea64c6ad2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.io.gfile"
 tf_module {
+  member {
+    name: "GFile"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "copy"
     argspec: "args=[\'src\', \'dst\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ba9e57bed4100437c8b71d8b506cc2c928a9ac9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-hinge.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.CategoricalHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CategoricalHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-proximity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-proximity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4952a76291c00bfdd73eed5412e7421887d1bab2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-proximity.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.CosineProximity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CosineProximity\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b3c62d3bef0b9d200577f34cbe303fc7a094acc
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-hinge.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.Hinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Hinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-squared-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f8badb2b6ec2ba8dd16136c32f5d27811a0d4d9d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-squared-hinge.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.SquaredHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.SquaredHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
index 9e26ddbdca0c45df195dd566952379887dcfcff3..b71e89883e434b2e80708d37910871d1599110d2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
@@ -8,6 +8,18 @@ tf_module {
     name: "CategoricalCrossentropy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CategoricalHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineProximity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Hinge"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "MeanAbsoluteError"
     mtype: "<type \'type\'>"
@@ -24,6 +36,10 @@ tf_module {
     name: "MeanSquaredLogarithmicError"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SquaredHinge"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "KLD"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -58,11 +74,11 @@ tf_module {
   }
   member_method {
     name: "cosine"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "cosine_proximity"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "deserialize"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d173fc879f9608f1e07bed1a5f59de8ba1addc22
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.CategoricalHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'categorical_hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-proximity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-proximity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c50638740f5fdb9028a9843b0597a31506007b8f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-proximity.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.CosineProximity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CosineProximity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'axis\'], varargs=None, keywords=None, defaults=[\'cosine_proximity\', \'None\', \'-1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7f13c75583b30d066318dcba4673adfa52772a4e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.Hinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Hinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..896320004c10378d264863486c7d61f65d042726
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.MeanAbsoluteError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsoluteError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_absolute_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37ab1a409b05ee4bd99b150c01fa3fcc6ce758fc
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.MeanAbsolutePercentageError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsolutePercentageError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_absolute_percentage_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9025abbb0a5954efa3790dc9f8e84b621acbed32
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.MeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_squared_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..808853994d0cabce5926a85f31698da154e1ba64
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.MeanSquaredLogarithmicError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredLogarithmicError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_squared_logarithmic_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..38f94048cc430c1dddfd0b2aae5dee47c18ed742
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.SquaredHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SquaredHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'squared_hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
index 905021dd790205e64a6f9839218200db98941927..54f5b21f556159d3f1652017677c6397c78ad9f1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
@@ -12,6 +12,14 @@ tf_module {
     name: "CategoricalAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CategoricalHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineProximity"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "FalseNegatives"
     mtype: "<type \'type\'>"
@@ -20,10 +28,30 @@ tf_module {
     name: "FalsePositives"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Hinge"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Mean"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "MeanAbsoluteError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsolutePercentageError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredLogarithmicError"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Precision"
     mtype: "<type \'type\'>"
@@ -44,6 +72,10 @@ tf_module {
     name: "SpecificityAtSensitivity"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SquaredHinge"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TrueNegatives"
     mtype: "<type \'type\'>"
@@ -90,11 +122,11 @@ tf_module {
   }
   member_method {
     name: "cosine"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "cosine_proximity"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "deserialize"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
index 773c74e64d13ca4a840b7f599fc2cbe9c161cd03..c7a50969b54e5efc4d338caa79dea76d86bffe8a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
@@ -95,6 +95,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
index 533544d21f2753f785113a30518f4fcbcff96cd7..3900c752c8527f68af2496f99083d80fc9d18106 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
@@ -116,6 +116,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
index e3926eb6d4714731d09ff9c5b75a89830c06e7c1..7b876099af6a28d9fca2e5c55aeae5e4610f82a6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
@@ -116,6 +116,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
index ba209df7824a9cc076499458e35acd7dcf1eaf35..5bddba8e798618f5b1d0cdc61ddff9725a495fe0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
@@ -116,6 +116,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
index 081fb0e08bcd1b35ab44459d1c8eb0857dd14956..62ba8bb59e8af14447fe570ba28c5d0eba7f6af8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
@@ -95,6 +95,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
index 2014a04301618c20af5cf6f1144eb4dbda2479e1..0803feeabd12acb7988459fe6da2748e19b70a5f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
@@ -95,6 +95,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
index 9a87ae9687741090485bd8d4d0d07d359a2015e7..6def32864b9cc660b94d628ccd53dc48a566ea81 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
@@ -91,6 +91,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
index 33afb835ce1d524991c0024bfb87c29a72aac08e..dbf1ac82d33b81c63e5c356ac736f63262797ff0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-inversion.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-inversion.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6a3fe4dd66397bbff4b0b9ca6d195adeb64e3337
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-inversion.pbtxt
@@ -0,0 +1,142 @@
+path: "tensorflow.linalg.LinearOperatorInversion"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_inversion.LinearOperatorInversion\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "operator"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'operator\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
index a9078c8ab5cca078237a29febabdbbd4a8b6c89c..85d902b977ceddd405abb1154a086d7bd29e7848 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
@@ -95,6 +95,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
index 4cfa3bb30d7382f3cf3cc0d5ce412d230d2a4287..638d82a599248e547bcae86ebd6d8d8dc3f6aa4b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
@@ -115,6 +115,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
index a87649133fd207ad59f2124c6b0b5aa44916e5a5..ab1b04bd3cb1b215b848019b6c578ce091f8f828 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
@@ -91,6 +91,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
index 32656467840fbbc0c8708ea68aac5aa75c11a540..961969aac58b78e4edd53b47f2932f71f2d21fd5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
index 49d8890c8942bc0021886ee6c9bc4e7625452655..e76738a9648123414159fdc9666a99b0577aa46e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
@@ -91,6 +91,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
index c89dc067b331603e227d9d578147e2dd1ee4a900..b35cd69da474a9665652f04f12b34a8d9f33fa8a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
index 9f7b422fabcd55aed98bc93f01143d35698c0399..5e49b75c3131b989c765ab03659fb225cc23e26e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
@@ -36,6 +36,10 @@ tf_module {
     name: "LinearOperatorIdentity"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LinearOperatorInversion"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LinearOperatorKronecker"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
index caa15280d1a1c46acd0242b8e184af48f78bdc73..deb1d100e71bf9353ce10f0c1044e0d893d24800 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
@@ -1080,6 +1080,10 @@ tf_module {
     name: "disable_resource_variables"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "disable_v2_batch_normalization"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "disable_v2_behavior"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
@@ -1124,6 +1128,10 @@ tf_module {
     name: "enable_resource_variables"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "enable_v2_batch_normalization"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "enable_v2_behavior"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-event.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-event.pbtxt
deleted file mode 100644
index 3b75a1735be76fe77689736e492c42c54ab795c1..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-event.pbtxt
+++ /dev/null
@@ -1,74 +0,0 @@
-path: "tensorflow.Event"
-tf_proto {
-  descriptor {
-    name: "Event"
-    field {
-      name: "wall_time"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "step"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "file_version"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-      oneof_index: 0
-    }
-    field {
-      name: "graph_def"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    field {
-      name: "summary"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary"
-      oneof_index: 0
-    }
-    field {
-      name: "log_message"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.LogMessage"
-      oneof_index: 0
-    }
-    field {
-      name: "session_log"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SessionLog"
-      oneof_index: 0
-    }
-    field {
-      name: "tagged_run_metadata"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TaggedRunMetadata"
-      oneof_index: 0
-    }
-    field {
-      name: "meta_graph_def"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    oneof_decl {
-      name: "what"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.-plugin-data.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.-plugin-data.pbtxt
deleted file mode 100644
index a66b74b315c6132e8f884bd52e7a3b5bd7f52ccd..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.-plugin-data.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.SummaryMetadata.PluginData"
-tf_proto {
-  descriptor {
-    name: "PluginData"
-    field {
-      name: "plugin_name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "content"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.pbtxt
deleted file mode 100644
index c02575b9626c848e9b871d2cc6febb26a5142f08..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.pbtxt
+++ /dev/null
@@ -1,40 +0,0 @@
-path: "tensorflow.SummaryMetadata"
-tf_proto {
-  descriptor {
-    name: "SummaryMetadata"
-    field {
-      name: "plugin_data"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SummaryMetadata.PluginData"
-    }
-    field {
-      name: "display_name"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "summary_description"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    nested_type {
-      name: "PluginData"
-      field {
-        name: "plugin_name"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "content"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-summary.-audio.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary.-audio.pbtxt
deleted file mode 100644
index 94f712073e0d0dda201fcf7adba849dd45a1229b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-summary.-audio.pbtxt
+++ /dev/null
@@ -1,36 +0,0 @@
-path: "tensorflow.Summary.Audio"
-tf_proto {
-  descriptor {
-    name: "Audio"
-    field {
-      name: "sample_rate"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_FLOAT
-    }
-    field {
-      name: "num_channels"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "length_frames"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "encoded_audio_string"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-    }
-    field {
-      name: "content_type"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-summary.-image.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary.-image.pbtxt
deleted file mode 100644
index fc1acb483b3051cba01f5d9bc8501a61965bbc37..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-summary.-image.pbtxt
+++ /dev/null
@@ -1,30 +0,0 @@
-path: "tensorflow.Summary.Image"
-tf_proto {
-  descriptor {
-    name: "Image"
-    field {
-      name: "height"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "width"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "colorspace"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "encoded_image_string"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-summary.-value.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary.-value.pbtxt
deleted file mode 100644
index feb84b6ee996549ac58aa0e8a4ac560f947b6339..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-summary.-value.pbtxt
+++ /dev/null
@@ -1,74 +0,0 @@
-path: "tensorflow.Summary.Value"
-tf_proto {
-  descriptor {
-    name: "Value"
-    field {
-      name: "node_name"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "tag"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "metadata"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SummaryMetadata"
-    }
-    field {
-      name: "simple_value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_FLOAT
-      oneof_index: 0
-    }
-    field {
-      name: "obsolete_old_style_histogram"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    field {
-      name: "image"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary.Image"
-      oneof_index: 0
-    }
-    field {
-      name: "histo"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.HistogramProto"
-      oneof_index: 0
-    }
-    field {
-      name: "audio"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary.Audio"
-      oneof_index: 0
-    }
-    field {
-      name: "tensor"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorProto"
-      oneof_index: 0
-    }
-    oneof_decl {
-      name: "value"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary.pbtxt
deleted file mode 100644
index b2bdff7171804aae114d1e3631e3074b1e4006ba..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-summary.pbtxt
+++ /dev/null
@@ -1,144 +0,0 @@
-path: "tensorflow.Summary"
-tf_proto {
-  descriptor {
-    name: "Summary"
-    field {
-      name: "value"
-      number: 1
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary.Value"
-    }
-    nested_type {
-      name: "Image"
-      field {
-        name: "height"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "width"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "colorspace"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "encoded_image_string"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-      }
-    }
-    nested_type {
-      name: "Audio"
-      field {
-        name: "sample_rate"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_FLOAT
-      }
-      field {
-        name: "num_channels"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_INT64
-      }
-      field {
-        name: "length_frames"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_INT64
-      }
-      field {
-        name: "encoded_audio_string"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-      }
-      field {
-        name: "content_type"
-        number: 5
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-    }
-    nested_type {
-      name: "Value"
-      field {
-        name: "node_name"
-        number: 7
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "tag"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "metadata"
-        number: 9
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.SummaryMetadata"
-      }
-      field {
-        name: "simple_value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_FLOAT
-        oneof_index: 0
-      }
-      field {
-        name: "obsolete_old_style_histogram"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-        oneof_index: 0
-      }
-      field {
-        name: "image"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.Summary.Image"
-        oneof_index: 0
-      }
-      field {
-        name: "histo"
-        number: 5
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.HistogramProto"
-        oneof_index: 0
-      }
-      field {
-        name: "audio"
-        number: 6
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.Summary.Audio"
-        oneof_index: 0
-      }
-      field {
-        name: "tensor"
-        number: 8
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.TensorProto"
-        oneof_index: 0
-      }
-      oneof_decl {
-        name: "value"
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
index 2d115904925eb96164484300baf628d41d3fcff4..a262c0f799634470090eeba90f480f94ac671f87 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
@@ -86,7 +86,7 @@ tf_module {
   }
   member_method {
     name: "bucket_by_sequence_length"
-    argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\'], "
+    argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "cardinality"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
index b06c73d12602b25426034f801be329fb88067011..9c29067b6d83a1e753f01ca62e89bfef559fc824 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
@@ -75,6 +75,10 @@ tf_class {
     name: "experimental_initialize"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
   member_method {
     name: "experimental_run"
     argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt
index df707e8920e4488ed6b40a7f93f56b5624188c84..22f8160c964de394a7cc4a51d1d17218a0da8b56 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt
@@ -26,6 +26,10 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'strategy\', \'replica_id_in_sync_group\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "all_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "merge_call"
     argspec: "args=[\'self\', \'merge_fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt
index 77706e57133e1186d9e98fcf9205ed4c91772eda..37b620891fbf84d33919365aa3efbcd1055ac762 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt
@@ -50,6 +50,10 @@ tf_class {
     name: "colocate_vars_with"
     argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "experimental_make_numpy_dataset"
+    argspec: "args=[\'self\', \'numpy_input\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "experimental_run_steps_on_iterator"
     argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
index 9a1df5514261a47aae6f3d11be78b5a6fa6da919..4aa6f1c4e144366d194535d107a32f888caf8a54 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
@@ -74,6 +74,10 @@ tf_class {
     name: "experimental_initialize"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
   member_method {
     name: "experimental_run"
     argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.-g-file.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.-g-file.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c6bf57a88fc1295da13e0b58671191c9d8ba8caa
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.-g-file.pbtxt
@@ -0,0 +1,58 @@
+path: "tensorflow.io.gfile.GFile"
+tf_class {
+  is_instance: "<class \'tensorflow.python.platform.gfile.GFile\'>"
+  is_instance: "<class \'tensorflow.python.lib.io.file_io.FileIO\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "mode"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flush"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "next"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read"
+    argspec: "args=[\'self\', \'n\'], varargs=None, keywords=None, defaults=[\'-1\'], "
+  }
+  member_method {
+    name: "readline"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "readlines"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "seek"
+    argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "tell"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "write"
+    argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt
index cfa3372b12bfe32eed4311c89b6448c0359c0913..a797c06ff337cffe503d89c09497996ea64c6ad2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.io.gfile"
 tf_module {
+  member {
+    name: "GFile"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "copy"
     argspec: "args=[\'src\', \'dst\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ba9e57bed4100437c8b71d8b506cc2c928a9ac9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-hinge.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.CategoricalHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CategoricalHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-proximity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-proximity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4952a76291c00bfdd73eed5412e7421887d1bab2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-proximity.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.CosineProximity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CosineProximity\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b3c62d3bef0b9d200577f34cbe303fc7a094acc
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-hinge.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.Hinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Hinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-squared-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f8badb2b6ec2ba8dd16136c32f5d27811a0d4d9d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-squared-hinge.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.SquaredHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.SquaredHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
index c198096d252cd9a3706bcbf6f1e4a1199ec7a1f7..673593ed6c0bc0a588300ad8aa4bd6897d65bc32 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
@@ -8,6 +8,18 @@ tf_module {
     name: "CategoricalCrossentropy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CategoricalHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineProximity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Hinge"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "MeanAbsoluteError"
     mtype: "<type \'type\'>"
@@ -28,6 +40,10 @@ tf_module {
     name: "Reduction"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SquaredHinge"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "KLD"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -62,11 +78,11 @@ tf_module {
   }
   member_method {
     name: "cosine"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "cosine_proximity"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "deserialize"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d173fc879f9608f1e07bed1a5f59de8ba1addc22
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.CategoricalHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'categorical_hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-proximity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-proximity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c50638740f5fdb9028a9843b0597a31506007b8f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-proximity.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.CosineProximity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CosineProximity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'axis\'], varargs=None, keywords=None, defaults=[\'cosine_proximity\', \'None\', \'-1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7f13c75583b30d066318dcba4673adfa52772a4e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.Hinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Hinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..896320004c10378d264863486c7d61f65d042726
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.MeanAbsoluteError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsoluteError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_absolute_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37ab1a409b05ee4bd99b150c01fa3fcc6ce758fc
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.MeanAbsolutePercentageError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsolutePercentageError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_absolute_percentage_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9025abbb0a5954efa3790dc9f8e84b621acbed32
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.MeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_squared_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..808853994d0cabce5926a85f31698da154e1ba64
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.MeanSquaredLogarithmicError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredLogarithmicError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_squared_logarithmic_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..38f94048cc430c1dddfd0b2aae5dee47c18ed742
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.SquaredHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SquaredHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'squared_hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
index 905021dd790205e64a6f9839218200db98941927..54f5b21f556159d3f1652017677c6397c78ad9f1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
@@ -12,6 +12,14 @@ tf_module {
     name: "CategoricalAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CategoricalHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineProximity"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "FalseNegatives"
     mtype: "<type \'type\'>"
@@ -20,10 +28,30 @@ tf_module {
     name: "FalsePositives"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Hinge"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Mean"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "MeanAbsoluteError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsolutePercentageError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredLogarithmicError"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Precision"
     mtype: "<type \'type\'>"
@@ -44,6 +72,10 @@ tf_module {
     name: "SpecificityAtSensitivity"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SquaredHinge"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TrueNegatives"
     mtype: "<type \'type\'>"
@@ -90,11 +122,11 @@ tf_module {
   }
   member_method {
     name: "cosine"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "cosine_proximity"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "deserialize"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt
index 78829def67d11e422aa33e06434e78d3048382d9..88fd96c71d62fb9ae6c846c9c3bfa0b07af6f743 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt
@@ -1,7 +1,6 @@
 path: "tensorflow.keras.optimizers.Adamax"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adamax.Adamax\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adam.Adam\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt
index 8c63a7dda98568b24ea1b3cda15d4c840fbfd804..7b824e7f0bd4a0d88f733c6cedfde669c01ef55f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.Nadam"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Nadam\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.nadam.Nadam\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'schedule_decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'None\', \'0.004\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'Nadam\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
index 773c74e64d13ca4a840b7f599fc2cbe9c161cd03..c7a50969b54e5efc4d338caa79dea76d86bffe8a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
@@ -95,6 +95,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
index 533544d21f2753f785113a30518f4fcbcff96cd7..3900c752c8527f68af2496f99083d80fc9d18106 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
@@ -116,6 +116,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
index e3926eb6d4714731d09ff9c5b75a89830c06e7c1..7b876099af6a28d9fca2e5c55aeae5e4610f82a6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
@@ -116,6 +116,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
index ba209df7824a9cc076499458e35acd7dcf1eaf35..5bddba8e798618f5b1d0cdc61ddff9725a495fe0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
@@ -116,6 +116,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
index 081fb0e08bcd1b35ab44459d1c8eb0857dd14956..62ba8bb59e8af14447fe570ba28c5d0eba7f6af8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
@@ -95,6 +95,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
index 2014a04301618c20af5cf6f1144eb4dbda2479e1..0803feeabd12acb7988459fe6da2748e19b70a5f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
@@ -95,6 +95,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
index 9a87ae9687741090485bd8d4d0d07d359a2015e7..6def32864b9cc660b94d628ccd53dc48a566ea81 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
@@ -91,6 +91,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
index 33afb835ce1d524991c0024bfb87c29a72aac08e..dbf1ac82d33b81c63e5c356ac736f63262797ff0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-inversion.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-inversion.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6a3fe4dd66397bbff4b0b9ca6d195adeb64e3337
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-inversion.pbtxt
@@ -0,0 +1,142 @@
+path: "tensorflow.linalg.LinearOperatorInversion"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_inversion.LinearOperatorInversion\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "operator"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'operator\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
index a9078c8ab5cca078237a29febabdbbd4a8b6c89c..85d902b977ceddd405abb1154a086d7bd29e7848 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
@@ -95,6 +95,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
index 4cfa3bb30d7382f3cf3cc0d5ce412d230d2a4287..638d82a599248e547bcae86ebd6d8d8dc3f6aa4b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
@@ -115,6 +115,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
index a87649133fd207ad59f2124c6b0b5aa44916e5a5..ab1b04bd3cb1b215b848019b6c578ce091f8f828 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
@@ -91,6 +91,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
index 32656467840fbbc0c8708ea68aac5aa75c11a540..961969aac58b78e4edd53b47f2932f71f2d21fd5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
index 49d8890c8942bc0021886ee6c9bc4e7625452655..e76738a9648123414159fdc9666a99b0577aa46e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
@@ -91,6 +91,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
index c89dc067b331603e227d9d578147e2dd1ee4a900..b35cd69da474a9665652f04f12b34a8d9f33fa8a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
index 3e1e2e3d54de3e2442299a783f933a60dfd2db6d..f9119cdd5f728f3b35d83248daff17547a497aa2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
@@ -36,6 +36,10 @@ tf_module {
     name: "LinearOperatorIdentity"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LinearOperatorInversion"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LinearOperatorKronecker"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
index 0cf1fe0e3c311ba7d436adaf1b25c7b00b5b5f9c..7e5f86d7e6c812bd9c00dd7536b2442a08c844ed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
@@ -8,10 +8,6 @@ tf_module {
     name: "DType"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "Event"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "GradientTape"
     mtype: "<type \'type\'>"
@@ -40,14 +36,6 @@ tf_module {
     name: "SparseTensor"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "Summary"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "SummaryMetadata"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "Tensor"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-event.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-event.pbtxt
deleted file mode 100644
index eb99d0f5334457aa654fed0553af143839328dba..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-event.pbtxt
+++ /dev/null
@@ -1,74 +0,0 @@
-path: "tensorflow.summary.Event"
-tf_proto {
-  descriptor {
-    name: "Event"
-    field {
-      name: "wall_time"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "step"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "file_version"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-      oneof_index: 0
-    }
-    field {
-      name: "graph_def"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    field {
-      name: "summary"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary"
-      oneof_index: 0
-    }
-    field {
-      name: "log_message"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.LogMessage"
-      oneof_index: 0
-    }
-    field {
-      name: "session_log"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SessionLog"
-      oneof_index: 0
-    }
-    field {
-      name: "tagged_run_metadata"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TaggedRunMetadata"
-      oneof_index: 0
-    }
-    field {
-      name: "meta_graph_def"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    oneof_decl {
-      name: "what"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer-cache.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer-cache.pbtxt
deleted file mode 100644
index 2a5b63dceae3c0ac27b34c2e896ee3b90bbd7f75..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer-cache.pbtxt
+++ /dev/null
@@ -1,16 +0,0 @@
-path: "tensorflow.summary.FileWriterCache"
-tf_class {
-  is_instance: "<class \'tensorflow.python.summary.writer.writer_cache.FileWriterCache\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "clear"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get"
-    argspec: "args=[\'logdir\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer.pbtxt
deleted file mode 100644
index 6b65b0ace3cf7740ab03390841c941592000d127..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer.pbtxt
+++ /dev/null
@@ -1,50 +0,0 @@
-path: "tensorflow.summary.FileWriter"
-tf_class {
-  is_instance: "<class \'tensorflow.python.summary.writer.writer.FileWriter\'>"
-  is_instance: "<class \'tensorflow.python.summary.writer.writer.SummaryToEventTransformer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'logdir\', \'graph\', \'max_queue\', \'flush_secs\', \'graph_def\', \'filename_suffix\', \'session\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'120\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_event"
-    argspec: "args=[\'self\', \'event\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "add_graph"
-    argspec: "args=[\'self\', \'graph\', \'global_step\', \'graph_def\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_meta_graph"
-    argspec: "args=[\'self\', \'meta_graph_def\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_run_metadata"
-    argspec: "args=[\'self\', \'run_metadata\', \'tag\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_session_log"
-    argspec: "args=[\'self\', \'session_log\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_summary"
-    argspec: "args=[\'self\', \'summary\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "flush"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_logdir"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reopen"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-description.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-description.pbtxt
deleted file mode 100644
index 4a8b59cf02ed46ef70f22564f3134214840600fe..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-description.pbtxt
+++ /dev/null
@@ -1,12 +0,0 @@
-path: "tensorflow.summary.SummaryDescription"
-tf_proto {
-  descriptor {
-    name: "SummaryDescription"
-    field {
-      name: "type_hint"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-audio.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-audio.pbtxt
deleted file mode 100644
index 8b271cf58fc11c8666abd456021afeedc0b14c7a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-audio.pbtxt
+++ /dev/null
@@ -1,36 +0,0 @@
-path: "tensorflow.summary.Summary.Audio"
-tf_proto {
-  descriptor {
-    name: "Audio"
-    field {
-      name: "sample_rate"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_FLOAT
-    }
-    field {
-      name: "num_channels"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "length_frames"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "encoded_audio_string"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-    }
-    field {
-      name: "content_type"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-image.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-image.pbtxt
deleted file mode 100644
index dbbc02dd0506dbcebd1690602b5786b02c3ed4a0..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-image.pbtxt
+++ /dev/null
@@ -1,30 +0,0 @@
-path: "tensorflow.summary.Summary.Image"
-tf_proto {
-  descriptor {
-    name: "Image"
-    field {
-      name: "height"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "width"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "colorspace"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "encoded_image_string"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-value.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-value.pbtxt
deleted file mode 100644
index 4176171cd938e383fe5366153364d8e8e8c1a1ee..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-value.pbtxt
+++ /dev/null
@@ -1,74 +0,0 @@
-path: "tensorflow.summary.Summary.Value"
-tf_proto {
-  descriptor {
-    name: "Value"
-    field {
-      name: "node_name"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "tag"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "metadata"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SummaryMetadata"
-    }
-    field {
-      name: "simple_value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_FLOAT
-      oneof_index: 0
-    }
-    field {
-      name: "obsolete_old_style_histogram"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    field {
-      name: "image"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary.Image"
-      oneof_index: 0
-    }
-    field {
-      name: "histo"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.HistogramProto"
-      oneof_index: 0
-    }
-    field {
-      name: "audio"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary.Audio"
-      oneof_index: 0
-    }
-    field {
-      name: "tensor"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorProto"
-      oneof_index: 0
-    }
-    oneof_decl {
-      name: "value"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.pbtxt
deleted file mode 100644
index d6c5e3a87a115b9bdcfd044abe93177eda2af275..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.pbtxt
+++ /dev/null
@@ -1,144 +0,0 @@
-path: "tensorflow.summary.Summary"
-tf_proto {
-  descriptor {
-    name: "Summary"
-    field {
-      name: "value"
-      number: 1
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary.Value"
-    }
-    nested_type {
-      name: "Image"
-      field {
-        name: "height"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "width"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "colorspace"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "encoded_image_string"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-      }
-    }
-    nested_type {
-      name: "Audio"
-      field {
-        name: "sample_rate"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_FLOAT
-      }
-      field {
-        name: "num_channels"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_INT64
-      }
-      field {
-        name: "length_frames"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_INT64
-      }
-      field {
-        name: "encoded_audio_string"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-      }
-      field {
-        name: "content_type"
-        number: 5
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-    }
-    nested_type {
-      name: "Value"
-      field {
-        name: "node_name"
-        number: 7
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "tag"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "metadata"
-        number: 9
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.SummaryMetadata"
-      }
-      field {
-        name: "simple_value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_FLOAT
-        oneof_index: 0
-      }
-      field {
-        name: "obsolete_old_style_histogram"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-        oneof_index: 0
-      }
-      field {
-        name: "image"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.Summary.Image"
-        oneof_index: 0
-      }
-      field {
-        name: "histo"
-        number: 5
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.HistogramProto"
-        oneof_index: 0
-      }
-      field {
-        name: "audio"
-        number: 6
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.Summary.Audio"
-        oneof_index: 0
-      }
-      field {
-        name: "tensor"
-        number: 8
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.TensorProto"
-        oneof_index: 0
-      }
-      oneof_decl {
-        name: "value"
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-tagged-run-metadata.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-tagged-run-metadata.pbtxt
deleted file mode 100644
index 27c8873320403cb2e7402ef9f1bb0e7134d5f96b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-tagged-run-metadata.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.summary.TaggedRunMetadata"
-tf_proto {
-  descriptor {
-    name: "TaggedRunMetadata"
-    field {
-      name: "tag"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "run_metadata"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
index 61670bd15122f65ef05d20ee5d023a3c326f7757..c59f1b8474302b5529895b8aa9216a2e197d958f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
@@ -1,33 +1,9 @@
 path: "tensorflow.summary"
 tf_module {
-  member {
-    name: "Event"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "FileWriter"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "FileWriterCache"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Summary"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "SummaryDescription"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "SummaryWriter"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "TaggedRunMetadata"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member_method {
     name: "create_file_writer"
     argspec: "args=[\'logdir\', \'max_queue\', \'flush_millis\', \'filename_suffix\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
index c72564e5987de36a95f7f44bae2b8122dcf256c4..a3ace15ca2cfe15cfd8f3ab98d9fabb603f0131e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
@@ -140,8 +140,4 @@ tf_module {
     name: "sdca_shrink_l1"
     argspec: "args=[\'weights\', \'l1\', \'l2\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "summary_iterator"
-    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu b/tensorflow/tools/ci_build/Dockerfile.gpu
index a4cad4b6c65c35651e58495c8f1b8b4c5b5f38d8..f5a28ff16352d5428ac698f2cc7f73b0b1ba3394 100644
--- a/tensorflow/tools/ci_build/Dockerfile.gpu
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04
 
 LABEL maintainer="Jan Prach <jendap@google.com>"
 
@@ -7,6 +7,12 @@ LABEL maintainer="Jan Prach <jendap@google.com>"
 RUN cp -P /usr/include/cudnn.h /usr/local/cuda/include
 RUN cp -P /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64
 
+# Installs TensorRT, which is not included in NVIDIA Docker containers.
+RUN apt-get update \
+        && apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer-dev=5.0.2-1+cuda10.0
+
 # Copy and run the install scripts.
 COPY install/*.sh /install/
 ARG DEBIAN_FRONTEND=noninteractive
@@ -24,7 +30,7 @@ COPY install/.bazelrc /etc/bazel.bazelrc
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
 
 # Link NCCL libray and header where the build script expects them.
-RUN mkdir /usr/local/cuda-9.0/lib &&  \
+RUN mkdir /usr/local/cuda/lib &&  \
     ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
     ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
 
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cpu b/tensorflow/tools/ci_build/Dockerfile.rbe.cpu
index 7e5860aeec186d908e5d2884bd690b2e5e43cffa..500fb6e0b3a995a91f0faf6555e2e248babbfda1 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cpu
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cpu
@@ -1,3 +1,8 @@
+# To push a new version, run:
+# $ docker build -f Dockerfile.rbe.cpu \
+#       --tag "gcr.io/tensorflow-testing/nosla-ubuntu16.04" .
+# $ docker push gcr.io/tensorflow-testing/nosla-ubuntu16.04
+
 FROM launcher.gcr.io/google/rbe-ubuntu16-04:r327695
 LABEL maintainer="Yu Yi <yiyu@google.com>"
 
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04 b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
index 4fe86066c91b2baa665070a6fd9d34ebc74bdab7..d08d31d91304d45c317fdb4b6dec5b05494f7e9b 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
@@ -1,7 +1,7 @@
 # To push a new version, run:
 # $ docker build -f Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04 \
-#       --tag "gcr.io/asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04" .
-# $ docker push gcr.io/asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04
+#       --tag "gcr.io/tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu14.04" .
+# $ docker push gcr.io/tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu14.04
 
 FROM ubuntu:14.04
 LABEL maintainer="Manuel Klimek <klimek@google.com>"
@@ -19,7 +19,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates
 ENV CUDA_VERSION 10.0.130
 ENV CUDA_PKG_VERSION 10-0=$CUDA_VERSION-1
 ENV CUDNN_VERSION 7.3.1.20
-ENV NCCL_VERSION 2.3.5
 ENV TENSORRT_VERSION 5.0.2
 ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
 ENV NVIDIA_REQUIRE_CUDA "cuda>=10.0,driver>=410"
@@ -48,25 +47,26 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcudnn7=$CUDNN_VERSION-1+cuda10.0 \
         libcudnn7=$CUDNN_VERSION-1+cuda10.0 \
         libcudnn7-dev=$CUDNN_VERSION-1+cuda10.0 \
-        libnccl2=$NCCL_VERSION-2+cuda10.0 \
-        libnccl-dev=$NCCL_VERSION-2+cuda10.0 \
         nvinfer-runtime-trt-repo-ubuntu1604-$TENSORRT_VERSION-ga-cuda10.0 && \
     apt-get update && apt-get install -y --no-install-recommends \
         libnvinfer5=$TENSORRT_VERSION-1+cuda10.0 \
         libnvinfer-dev=$TENSORRT_VERSION-1+cuda10.0 && \
     ln -s cuda-10.0 /usr/local/cuda && \
     apt-mark hold libcudnn7 && \
-    apt-mark hold libnccl2 && \
     rm -rf /var/lib/apt/lists/*
 
 # TODO(b/110903506): Provide a link to the SONAME of libcuda.so.
 # https://github.com/NVIDIA/nvidia-docker/issues/775
 RUN ln -s libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
 
-# TODO(klimek): Once the TODO in tensorflow's configure.py to correctly find
-# libnccl is resolved, delete this block.
-RUN ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so \
- && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so.2
+# Install a newer version of libstdc++, as new clang versions do not work
+# with the stock ubuntu 14.04 libstdc++.
+RUN apt-get update && \
+    apt-get install -y software-properties-common && \
+    add-apt-repository ppa:ubuntu-toolchain-r/test -y && \
+    apt-get update && \
+    apt-get install -y libstdc++-7-dev && \
+    rm -rf /var/lib/apt/lists/*
 
 # Copy and run the install scripts.
 COPY install/*.sh /install/
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04 b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
index 60a23e1edbced8dbef738e290353cdfb60ea86a6..4ce4214065fbddd4769a4a35941e3b752aa49c9c 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
@@ -1,7 +1,7 @@
 # To push a new version, run:
 # $ docker build -f Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04 \
-#       --tag "gcr.io/asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04" .
-# $ docker push gcr.io/asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04
+#       --tag "gcr.io/tensorflow-testing/nosla-cuda9.0-cudnn7-ubuntu14.04" .
+# $ docker push gcr.io/tensorflow-testing/nosla-cuda9.0-cudnn7-ubuntu14.04
 #
 # TODO(klimek): Include clang in this image so we can also target clang
 # builds.
@@ -25,7 +25,6 @@ ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
 ENV NVIDIA_VISIBLE_DEVICES all
 ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
 ENV NVIDIA_REQUIRE_CUDA "cuda>=9.0"
-ENV NCCL_VERSION 2.2.13
 ENV TENSORRT_VERSION 5.0.2
 ENV CUDNN_VERSION 7.1.4.18
 
@@ -45,14 +44,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         cuda-cudart-$CUDA_PKG_VERSION \
         cuda-libraries-$CUDA_PKG_VERSION \
         cuda-cublas-9-0=9.0.176.4-1 \
-        libnccl2=$NCCL_VERSION-1+cuda9.0 \
         cuda-libraries-dev-$CUDA_PKG_VERSION \
         cuda-nvml-dev-$CUDA_PKG_VERSION \
         cuda-minimal-build-$CUDA_PKG_VERSION \
         cuda-command-line-tools-$CUDA_PKG_VERSION \
         cuda-core-9-0=9.0.176.3-1 \
         cuda-cublas-dev-9-0=9.0.176.4-1 \
-        libnccl-dev=$NCCL_VERSION-1+cuda9.0 \
         libcudnn7-dev=$CUDNN_VERSION-1+cuda9.0 \
         libcudnn7=$CUDNN_VERSION-1+cuda9.0 \
         nvinfer-runtime-trt-repo-ubuntu1604-$TENSORRT_VERSION-ga-cuda9.0 && \
@@ -60,7 +57,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libnvinfer5=$TENSORRT_VERSION-1+cuda9.0 \
         libnvinfer-dev=$TENSORRT_VERSION-1+cuda9.0 && \
     ln -s cuda-9.0 /usr/local/cuda && \
-    apt-mark hold libnccl2 && \
     apt-mark hold libcudnn7 libcudnn7-dev && \
     rm -rf /var/lib/apt/lists/*
 
@@ -71,11 +67,6 @@ RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
 # https://github.com/NVIDIA/nvidia-docker/issues/775
 RUN ln -s libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
 
-# TODO(klimek): Once the TODO in tensorflow's configure.py to correctly find
-# libnccl is resolved, delete this block.
-RUN ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so \
- && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so.2
-
 # Install a newer version of libstdc++, as new clang versions do not work
 # with the stock ubuntu 14.04 libstdc++.
 RUN apt-get update && \
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.gpu b/tensorflow/tools/ci_build/Dockerfile.rbe.gpu
index b65620583676f7ae2a4e849e33df05a18c4c9a24..c4912a65b65d61c6154be5083805d430d697f662 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.gpu
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.gpu
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04
 
 LABEL maintainer="Nick Lopez <ngiraldo@google.com>"
 
diff --git a/tensorflow/tools/ci_build/builds/run_pip_tests.sh b/tensorflow/tools/ci_build/builds/run_pip_tests.sh
index 7d5cf3f8439e223e0e8591333e727b2e58ca275c..a095633a22e8b24a4561ad3e13902a34424717ae 100755
--- a/tensorflow/tools/ci_build/builds/run_pip_tests.sh
+++ b/tensorflow/tools/ci_build/builds/run_pip_tests.sh
@@ -88,7 +88,8 @@ if [[ ${IS_GPU} == "1" ]]; then
   PIP_TEST_FILTER_TAG="-no_gpu,-no_pip_gpu,${PIP_TEST_FILTER_TAG}"
 fi
 if [[ ${IS_MAC} == "1" ]]; then
-  PIP_TEST_FILTER_TAG="-nomac,${PIP_TEST_FILTER_TAG}"
+  # TODO(b/122370901): Fix nomac, no_mac inconsistency.
+  PIP_TEST_FILTER_TAG="-nomac,-no_mac,${PIP_TEST_FILTER_TAG}"
 fi
 
 # Bazel flags we need for all tests:
diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index 435ec7ca68fc28362b9b546f977b24e003e55d2f..41b9f241d518b8345c7c3f9b2e496349d609144c 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -398,7 +398,8 @@ if [[ "${TF_BUILD_APPEND_ARGUMENTS}" == *"--test_tag_filters="* ]]; then
         NEW_ITEM="${NEW_ITEM},-benchmark-test"
       fi
       if [[ ${IS_MAC} == "1" ]] && [[ ${NEW_ITEM} != *"nomac"* ]]; then
-        NEW_ITEM="${NEW_ITEM},-nomac"
+        # TODO(b/122370901): Fix nomac, no_mac inconsistency.
+        NEW_ITEM="${NEW_ITEM},-nomac,-no_mac"
       fi
       EXTRA_ARGS="${EXTRA_ARGS} ${NEW_ITEM}"
     else
@@ -408,11 +409,13 @@ if [[ "${TF_BUILD_APPEND_ARGUMENTS}" == *"--test_tag_filters="* ]]; then
 else
   EXTRA_ARGS="${EXTRA_ARGS} ${TF_BUILD_APPEND_ARGUMENTS} --test_tag_filters=-no_oss,-oss_serial,-benchmark-test"
   if [[ ${IS_MAC} == "1" ]]; then
-    EXTRA_ARGS="${EXTRA_ARGS},-nomac"
+    # TODO(b/122370901): Fix nomac, no_mac inconsistency.
+    EXTRA_ARGS="${EXTRA_ARGS},-nomac,-no_mac"
   fi
   EXTRA_ARGS="${EXTRA_ARGS} --build_tag_filters=-no_oss,-oss_serial,-benchmark-test"
   if [[ ${IS_MAC} == "1" ]]; then
-    EXTRA_ARGS="${EXTRA_ARGS},-nomac"
+    # TODO(b/122370901): Fix nomac, no_mac inconsistency.
+    EXTRA_ARGS="${EXTRA_ARGS},-nomac,-no_mac"
   fi
 fi
 
diff --git a/tensorflow/tools/ci_build/copy_binary.py b/tensorflow/tools/ci_build/copy_binary.py
index 148526492d25e9acebe036294175e2814b2ead12..40a744374564d3ad3e663de8453d4085202c4e0c 100755
--- a/tensorflow/tools/ci_build/copy_binary.py
+++ b/tensorflow/tools/ci_build/copy_binary.py
@@ -33,7 +33,7 @@ import tempfile
 import zipfile
 
 TF_NIGHTLY_REGEX = (r"(.+)tf_nightly(|_gpu)-(\d\.[\d]{1,2}"
-                    "\.\d.dev[\d]{0,8})-(.+)\.whl")
+                    r"\.\d.dev[\d]{0,8})-(.+)\.whl")
 BINARY_STRING_TEMPLATE = "%s-%s-%s.whl"
 
 
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh b/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
index 7be5f454ecd6344cc1b0b79789c2b18acefc448d..a8b73cbe0cfe7fda70483a8b10fee2a7648b138a 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
@@ -36,4 +36,4 @@ yes "" | $PYTHON_BIN_PATH configure.py
 bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test --test_lang_filters=cc,py -k \
     --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only \
     --config=mkl --test_env=KMP_BLOCKTIME=0 --config=opt --test_output=errors -- \
-    //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
+    //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/... -//tensorflow/lite/...
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh b/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
index 3efd994d783d8f47b3471cc5ce177293b1e017cc..1184d4acec61f36cc630df313d403d33d73e1e7a 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
@@ -31,6 +31,7 @@ export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=$(which python2)
 yes "" | $PYTHON_BIN_PATH configure.py
 which bazel
+# TODO(b/122370901): Fix nomac, no_mac inconsistency.
 bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac,-no_mac \
     --test_timeout 300,450,1200,3600 \
     --test_size_filters=small,medium --config=opt \
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
index adee0d3171fe13261f177a6f8a3b55aeb5789cc5..d39340b1d83dde254a00fea1ff6090e1df2d10ae 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
@@ -32,6 +32,7 @@ export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=$(which python2)
 yes "" | $PYTHON_BIN_PATH configure.py
 which bazel
+# TODO(b/122370901): Fix nomac, no_mac inconsistency.
 bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac,-no_mac \
     --test_timeout 300,450,1200,3600 --config=opt \
     --announce_rc \
diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
index 864278c6477b4b1e7e9bc3836e3e3d102d086530..987f0769b2d6da4631b6f408af4dbf62d9099f76 100755
--- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
+++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
@@ -107,6 +107,7 @@ bazel build -c opt ${PI_COPTS} \
   --copt=-funsafe-math-optimizations --copt=-ftree-vectorize \
   --copt=-fomit-frame-pointer --cpu=armeabi \
   --crosstool_top=@local_config_arm_compiler//:toolchain \
+  --define tensorflow_mkldnn_contraction_kernel=0 \
   --verbose_failures \
   //tensorflow:libtensorflow.so \
   //tensorflow:libtensorflow_framework.so \
diff --git a/tensorflow/tools/compatibility/BUILD b/tensorflow/tools/compatibility/BUILD
index a9902d77f5ec103fe2000a4a470d425e3998f45e..31dbc02963d60a4943f0683252c86ea0ba1610c0 100644
--- a/tensorflow/tools/compatibility/BUILD
+++ b/tensorflow/tools/compatibility/BUILD
@@ -1,17 +1,21 @@
-licenses(["notice"])  # Apache 2.0
-
-package(default_visibility = ["//tensorflow:internal"])
-
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_copts",  # @unused
     "tf_cc_test",  # @unused
 )
 
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
 py_library(
     name = "ast_edits",
     srcs = ["ast_edits.py"],
     srcs_version = "PY2AND3",
+    deps = [
+        "@pasta",
+        "@six_archive//:six",
+    ],
 )
 
 py_test(
@@ -65,6 +69,7 @@ py_library(
         ":ast_edits",
         ":renames_v2",
         ":reorders_v2",
+        "@six_archive//:six",
     ],
 )
 
diff --git a/tensorflow/tools/compatibility/ast_edits.py b/tensorflow/tools/compatibility/ast_edits.py
index 9106ec97c8fc79e7cb31595277d95d9638ef2f39..2254c223cea940a48e5172120b969cee4751cf16 100644
--- a/tensorflow/tools/compatibility/ast_edits.py
+++ b/tensorflow/tools/compatibility/ast_edits.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import ast
-import collections
 import os
 import re
 import shutil
@@ -27,6 +26,9 @@ import sys
 import tempfile
 import traceback
 
+import pasta
+import six
+
 # Some regular expressions we will need for parsing
 FIND_OPEN = re.compile(r"^\s*(\[).*$")
 FIND_STRING_CHARS = re.compile(r"['\"]")
@@ -44,264 +46,294 @@ class APIChangeSpec(object):
     notifications)
   * `function_reorders`: maps functions whose argument order has changed to the
     list of arguments in the new order
-  * `function_handle`: maps function names to custom handlers for the function
   * `function_warnings`: maps full names of functions to warnings that will be
     printed out if the function is used. (e.g. tf.nn.convolution())
-  * `unrestricted_function_warnings`: maps names of functions to warnings that
-    will be printed out when the function is used (e.g. foo.convolution()).
-  * `function_keyword_additions`: maps function names to a map of arg->value
-    names that should be passed to the function.
+  * `function_transformers`: maps function names to custom handlers
 
   For an example, see `TFAPIChangeSpec`.
   """
 
 
-class _FileEditTuple(
-    collections.namedtuple("_FileEditTuple",
-                           ["comment", "line", "start", "old", "new"])):
-  """Each edit that is recorded by a _FileEditRecorder.
+class _PastaEditVisitor(ast.NodeVisitor):
+  """AST Visitor that processes function calls.
 
-  Fields:
-    comment: A description of the edit and why it was made.
-    line: The line number in the file where the edit occurs (1-indexed).
-    start: The column number in the file where the edit occurs (0-indexed).
-    old: text string to remove (this must match what was in file).
-    new: text string to add in place of `old`.
+  Updates function calls from old API version to new API version using a given
+  change spec.
   """
 
-  __slots__ = ()
-
-
-class _FileEditRecorder(object):
-  """Record changes that need to be done to the file."""
-
-  def __init__(self, filename):
-    # all edits are lists of chars
-    self._filename = filename
-
-    self._line_to_edit = collections.defaultdict(list)
-    self._errors = []
-
-  def process(self, text):
-    """Process a list of strings, each corresponding to the recorded changes.
-
-    Args:
-      text: A list of lines of text (assumed to contain newlines)
-    Returns:
-      A tuple of the modified text and a textual description of what is done.
-    Raises:
-      ValueError: if substitution source location does not have expected text.
-    """
-
-    change_report = ""
-
-    # Iterate of each line
-    for line, edits in self._line_to_edit.items():
-      offset = 0
-      # sort by column so that edits are processed in order in order to make
-      # indexing adjustments cumulative for changes that change the string
-      # length
-      edits.sort(key=lambda x: x.start)
-
-      # Extract each line to a list of characters, because mutable lists
-      # are editable, unlike immutable strings.
-      char_array = list(text[line - 1])
-
-      # Record a description of the change
-      change_report += "%r Line %d\n" % (self._filename, line)
-      change_report += "-" * 80 + "\n\n"
-      for e in edits:
-        change_report += "%s\n" % e.comment
-      change_report += "\n    Old: %s" % (text[line - 1])
-
-      # Make underscore buffers for underlining where in the line the edit was
-      change_list = [" "] * len(text[line - 1])
-      change_list_new = [" "] * len(text[line - 1])
-
-      # Iterate for each edit
-      for e in edits:
-        # Create effective start, end by accounting for change in length due
-        # to previous edits
-        start_eff = e.start + offset
-        end_eff = start_eff + len(e.old)
-
-        # Make sure the edit is changing what it should be changing
-        old_actual = "".join(char_array[start_eff:end_eff])
-        if old_actual != e.old:
-          raise ValueError("Expected text %r but got %r" %
-                           ("".join(e.old), "".join(old_actual)))
-        # Make the edit
-        char_array[start_eff:end_eff] = list(e.new)
-
-        # Create the underline highlighting of the before and after
-        change_list[e.start:e.start + len(e.old)] = "~" * len(e.old)
-        change_list_new[start_eff:end_eff] = "~" * len(e.new)
-
-        # Keep track of how to generate effective ranges
-        offset += len(e.new) - len(e.old)
-
-      # Finish the report comment
-      change_report += "         %s\n" % "".join(change_list)
-      text[line - 1] = "".join(char_array)
-      change_report += "    New: %s" % (text[line - 1])
-      change_report += "         %s\n\n" % "".join(change_list_new)
-    return "".join(text), change_report, self._errors
-
-  def add(self, comment, line, start, old, new, error=None):
-    """Add a new change that is needed.
+  def __init__(self, api_change_spec):
+    self._api_change_spec = api_change_spec
+    self._log = []   # Holds 3-tuples: line, col, msg.
+    self._errors = []  # Same structure as _log.
+    self._stack = []  # Allow easy access to parents.
 
-    Args:
-      comment: A description of what was changed
-      line: Line number (1 indexed)
-      start: Column offset (0 indexed)
-      old: old text
-      new: new text
-      error: this "edit" is something that cannot be fixed automatically
-    Returns:
-      None
-    """
+  # Overridden to maintain a stack of nodes to allow for parent access
+  def visit(self, node):
+    self._stack.append(node)
+    super(_PastaEditVisitor, self).visit(node)
+    self._stack.pop()
 
-    self._line_to_edit[line].append(
-        _FileEditTuple(comment, line, start, old, new))
-    if error:
-      self._errors.append("%s:%d: %s" % (self._filename, line, error))
+  @property
+  def errors(self):
+    return self._errors
 
+  @property
+  def log(self):
+    return self._log
 
-class _ASTCallVisitor(ast.NodeVisitor):
-  """AST Visitor that processes function calls.
+  def _format_log(self, log):
+    text = ""
+    for log_entry in log:
+      text += "Line %d:%d: %s\n" % log_entry
+    return text
 
-  Updates function calls from old API version to new API version using a given
-  change spec.
-  """
+  def log_text(self):
+    return self._format_log(self.log)
 
-  def __init__(self, filename, lines, api_change_spec):
-    self._filename = filename
-    self._file_edit = _FileEditRecorder(filename)
-    self._lines = lines
-    self._api_change_spec = api_change_spec
+  def add_log(self, lineno, col, msg):
+    self._log.append((lineno, col, msg))
+    print("Line %d:%d: %s" % (lineno, col, msg))
 
-  def process(self, lines):
-    return self._file_edit.process(lines)
+  def add_error(self, lineno, col, msg):
+    # All errors are also added to the regular log.
+    self.add_log(lineno, col, msg)
+    self._errors.append((lineno, col, msg))
 
-  def generic_visit(self, node):
-    ast.NodeVisitor.generic_visit(self, node)
+  def add_logs(self, logs):
+    """Record a log and print it.
 
-  def _rename_functions(self, node, full_name):
-    symbol_renames = self._api_change_spec.symbol_renames
-    try:
-      new_name = symbol_renames[full_name]
-      self._file_edit.add("Renamed function %r to %r" % (full_name, new_name),
-                          node.lineno, node.col_offset, full_name, new_name)
-    except KeyError:
-      pass
+    The log should be a tuple (lineno, col_offset, msg), which will be printed
+    and then recorded. It is part of the log available in the self.log property.
 
-  def _print_warning_for_function(self, node, full_name):
-    function_warnings = self._api_change_spec.function_warnings
-    try:
-      warning_message = function_warnings[full_name]
-      warning_message = warning_message.replace("<function name>", full_name)
-      self._file_edit.add(warning_message,
-                          node.lineno, node.col_offset, full_name, full_name,
-                          error="%s requires manual check." % full_name)
-    except KeyError:
-      pass
+    Args:
+      logs: The log to add. Must be a tuple (lineno, col_offset, msg).
+    """
+    self._log.extend(logs)
+    for log in logs:
+      print("Line %d:%d: %s" % log)
 
-  def _print_warning_for_function_unrestricted(self, node):
-    """Print a warning when specific functions are called.
+  def add_errors(self, errors):
+    """Record an error and print it.
 
-    The function _print_warning_for_function matches the full name of the called
-    function, e.g., tf.foo.bar(). This function matches the function name that
-    is called, as long as the function is an attribute. For example,
-    `tf.foo.bar()` and `foo.bar()` are matched, but not `bar()`.
+    The error must be a tuple (lineno, col_offset, msg), which will be printed
+    and then recorded as both a log and an error. It is therefore part of the
+    log available in the self.log as well as the self.errors property.
 
     Args:
-      node: ast.Call object
+      errors: The log to add. Must be a tuple (lineno, col_offset, msg).
     """
-    function_warnings = getattr(
-        self._api_change_spec, "unrestricted_function_warnings", {})
-    if isinstance(node.func, ast.Attribute):
-      function_name = node.func.attr
-      try:
-        warning_message = function_warnings[function_name]
-        self._file_edit.add(warning_message,
-                            node.lineno, node.col_offset, "", "",
-                            error="%s requires manual check." % function_name)
-      except KeyError:
-        pass
-
-  def _get_attribute_full_path(self, node):
-    """Traverse an attribute to generate a full name e.g. tf.foo.bar.
+    self.add_logs(errors)
+    self._errors.extend(errors)
+
+  def _get_applicable_entries(self, transformer_field, full_name, name):
+    """Get all list entries indexed by name that apply to full_name or name."""
+    # Transformers are indexed to full name, name, or no name
+    # as a performance optimization.
+    function_transformers = getattr(self._api_change_spec,
+                                    transformer_field, {})
+
+    glob_name = "*." + name if name else None
+    transformers = []
+    if full_name in function_transformers:
+      transformers.append(function_transformers[full_name])
+    if glob_name in function_transformers:
+      transformers.append(function_transformers[glob_name])
+    if "*" in function_transformers:
+      transformers.append(function_transformers["*"])
+    return transformers
+
+  def _get_applicable_dict(self, transformer_field, full_name, name):
+    """Get all dict entries indexed by name that apply to full_name or name."""
+    # Transformers are indexed to full name, name, or no name
+    # as a performance optimization.
+    function_transformers = getattr(self._api_change_spec,
+                                    transformer_field, {})
+
+    glob_name = "*." + name if name else None
+    transformers = function_transformers.get("*", {}).copy()
+    transformers.update(function_transformers.get(glob_name, {}))
+    transformers.update(function_transformers.get(full_name, {}))
+    return transformers
+
+  def _get_full_name(self, node):
+    """Traverse an Attribute node to generate a full name, e.g., "tf.foo.bar".
+
+    This is the inverse of _full_name_node.
 
     Args:
       node: A Node of type Attribute.
 
     Returns:
-      a '.'-delimited full-name or None if the tree was not a simple form.
+      a '.'-delimited full-name or None if node was not Attribute or Name.
       i.e. `foo()+b).bar` returns None, while `a.b.c` would return "a.b.c".
     """
     curr = node
     items = []
     while not isinstance(curr, ast.Name):
       if not isinstance(curr, ast.Attribute):
-        return None, None
+        return None
       items.append(curr.attr)
       curr = curr.value
     items.append(curr.id)
-    return ".".join(reversed(items)), items[0]
+    return ".".join(reversed(items))
 
-  def _find_true_position(self, node):
-    """Return correct line number and column offset for a given node.
+  def _full_name_node(self, name, ctx=ast.Load()):
+    """Make an Attribute or Name node for name.
 
-    This is necessary mainly because ListComp's location reporting reports
-    the next token after the list comprehension list opening.
+    Translate a qualified name into nested Attribute nodes (and a Name node).
+
+    Args:
+      name: The name to translate to a node.
+      ctx: What context this name is used in. Defaults to Load()
 
     Returns:
-      lineno, offset for the given node
+      A Name or Attribute node.
+    """
+    names = name.split(".")
+    names.reverse()
+    node = ast.Name(id=names.pop(), ctx=ast.Load())
+    while names:
+      node = ast.Attribute(value=node, attr=names.pop(), ctx=ast.Load())
+
+    # Change outermost ctx to the one given to us (inner ones should be Load).
+    node.ctx = ctx
+    return node
+
+  def _maybe_add_warning(self, node, full_name):
+    """Adds an error to be printed about full_name at node."""
+    function_warnings = self._api_change_spec.function_warnings
+    if full_name in function_warnings:
+      warning_message = function_warnings[full_name]
+      warning_message = warning_message.replace("<function name>", full_name)
+      self.add_error(node.lineno, node.col_offset,
+                     "%s requires manual check: %s." % (full_name,
+                                                        warning_message))
+      return True
+    else:
+      return False
+
+  def _maybe_add_call_warning(self, node, full_name, name):
+    """Print a warning when specific functions are called with selected args.
+
+    The function _print_warning_for_function matches the full name of the called
+    function, e.g., tf.foo.bar(). This function matches the function name that
+    is called, as long as the function is an attribute. For example,
+    `tf.foo.bar()` and `foo.bar()` are matched, but not `bar()`.
 
     Args:
-      node: Node for which we wish to know the lineno and col_offset
+      node: ast.Call object
+      full_name: The precomputed full name of the callable, if one exists, None
+        otherwise.
+      name: The precomputed name of the callable, if one exists, None otherwise.
+
+    Returns:
+      Whether an error was recorded.
     """
-    if isinstance(node, ast.ListComp):
-      # Strangely, ast.ListComp returns the col_offset of the first token
-      # after the '[' token which appears to be a bug. Workaround by
-      # explicitly finding the real start of the list comprehension.
-      line = node.lineno
-      col = node.col_offset
-      # loop over lines
-      while 1:
-        # Reverse the text to and regular expression search for whitespace
-        text = self._lines[line - 1]
-        reversed_preceding_text = text[:col][::-1]
-        # First find if a [ can be found with only whitespace between it and
-        # col.
-        m = FIND_OPEN.match(reversed_preceding_text)
-        if m:
-          new_col_offset = col - m.start(1) - 1
-          return line, new_col_offset
+    # Only look for *.-warnings here, the other will be handled by the Attribute
+    # visitor. Also, do not warn for bare functions, only if the call func is
+    # an attribute.
+    warned = False
+    if isinstance(node.func, ast.Attribute):
+      warned = self._maybe_add_warning(node, "*." + name)
+
+    # All arg warnings are handled here, since only we have the args
+    arg_warnings = self._get_applicable_dict("function_arg_warnings",
+                                             full_name, name)
+
+    used_args = [kw.arg for kw in node.keywords]
+    for (kwarg, arg), warning in arg_warnings.items():
+      if kwarg in used_args or len(node.args) > arg:
+        warned = True
+        warning_message = warning.replace("<function name>", full_name or name)
+        self.add_error(node.lineno, node.col_offset,
+                       "%s called with %s argument requires manual check: %s." %
+                       (full_name or name, kwarg, warning_message))
+
+    return warned
+
+  def _maybe_rename(self, parent, node, full_name):
+    """Replace node (Attribute or Name) with a node representing full_name."""
+    new_name = self._api_change_spec.symbol_renames.get(full_name, None)
+    if new_name:
+      self.add_log(node.lineno, node.col_offset,
+                   "Renamed %r to %r" % (full_name, new_name))
+      new_node = self._full_name_node(new_name, node.ctx)
+      ast.copy_location(new_node, node)
+      pasta.ast_utils.replace_child(parent, node, new_node)
+      return True
+    else:
+      return False
+
+  def _maybe_change_to_function_call(self, parent, node, full_name):
+    """Wraps node (typically, an Attribute or Expr) in a Call."""
+    if full_name in self._api_change_spec.change_to_function:
+      if not isinstance(parent, ast.Call):
+        # ast.Call's constructor is really picky about how many arguments it
+        # wants, and also, it changed between Py2 and Py3.
+        if six.PY2:
+          new_node = ast.Call(node, [], [], None, None)
+        else:
+          new_node = ast.Call(node, [], [])
+        pasta.ast_utils.replace_child(parent, node, new_node)
+        ast.copy_location(new_node, node)
+        self.add_log(node.lineno, node.col_offset,
+                     "Changed %r to a function call" % full_name)
+        return True
+    return False
+
+  def _maybe_add_arg_names(self, node, full_name):
+    """Make args into keyword args if function called full_name requires it."""
+    function_reorders = self._api_change_spec.function_reorders
+
+    if full_name in function_reorders:
+      reordered = function_reorders[full_name]
+      new_keywords = []
+      for idx, arg in enumerate(node.args):
+        keyword_arg = reordered[idx]
+        new_keywords.append(ast.keyword(arg=keyword_arg, value=arg))
+
+      if new_keywords:
+        self.add_log(node.lineno, node.col_offset,
+                     "Added keywords to args of function %r" % full_name)
+        node.args = []
+        node.keywords = new_keywords + (node.keywords or [])
+        return True
+    return False
+
+  def _maybe_modify_args(self, node, full_name, name):
+    """Rename keyword args if the function called full_name requires it."""
+    renamed_keywords = self._get_applicable_dict("function_keyword_renames",
+                                                 full_name, name)
+
+    if not renamed_keywords:
+      return False
+
+    modified = False
+    new_keywords = []
+    for keyword in node.keywords:
+      argkey = keyword.arg
+      if argkey in renamed_keywords:
+        modified = True
+        if renamed_keywords[argkey] is None:
+          lineno = getattr(keyword, "lineno", node.lineno)
+          col_offset = getattr(keyword, "col_offset", node.col_offset)
+          self.add_log(lineno, col_offset,
+                       "Removed argument %s for function %s" % (
+                           argkey, full_name or name))
         else:
-          if (reversed_preceding_text == "" or
-              reversed_preceding_text.isspace()):
-            line = line - 1
-            prev_line = self._lines[line - 1]
-            # TODO(aselle):
-            # this is poor comment detection, but it is good enough for
-            # cases where the comment does not contain string literal starting/
-            # ending characters. If ast gave us start and end locations of the
-            # ast nodes rather than just start, we could use string literal
-            # node ranges to filter out spurious #'s that appear in string
-            # literals.
-            comment_start = prev_line.find("#")
-            if comment_start == -1:
-              col = len(prev_line) - 1
-            elif FIND_STRING_CHARS.search(prev_line[comment_start:]) is None:
-              col = comment_start
-            else:
-              return None, None
-          else:
-            return None, None
-    # Most other nodes return proper locations (with notably does not), but
-    # it is not possible to use that in an argument.
-    return node.lineno, node.col_offset
+          keyword.arg = renamed_keywords[argkey]
+          lineno = getattr(keyword, "lineno", node.lineno)
+          col_offset = getattr(keyword, "col_offset", node.col_offset)
+          self.add_log(lineno, col_offset,
+                       "Renamed keyword argument for %s from %s to %s" % (
+                           full_name, argkey, renamed_keywords[argkey]))
+          new_keywords.append(keyword)
+      else:
+        new_keywords.append(keyword)
+
+    if modified:
+      node.keywords = new_keywords
+    return modified
 
   def visit_Call(self, node):  # pylint: disable=invalid-name
     """Handle visiting a call node in the AST.
@@ -309,104 +341,74 @@ class _ASTCallVisitor(ast.NodeVisitor):
     Args:
       node: Current Node
     """
-    self._print_warning_for_function_unrestricted(node)
-
-    # Find a simple attribute name path e.g. "tf.foo.bar"
-    full_name, name = self._get_attribute_full_path(node.func)
-
-    # Make sure the func is marked as being part of a call
-    node.func.is_function_for_call = True
+    assert self._stack[-1] is node
 
+    # Get the name for this call, so we can index stuff with it.
+    full_name = self._get_full_name(node.func)
     if full_name:
-      # Call special handlers
-      function_handles = self._api_change_spec.function_handle
-      glob_name = "*.{}".format(name)
-      if glob_name in function_handles:
-        function_handles[glob_name](self._file_edit, node, self._lines)
-      if full_name in function_handles:
-        function_handles[full_name](self._file_edit, node, self._lines)
-
-      # Examine any non-keyword argument and make it into a keyword argument
-      # if reordering required.
-      function_reorders = self._api_change_spec.function_reorders
-      function_keyword_renames = (
-          self._api_change_spec.function_keyword_renames)
-
-      if full_name in function_reorders:
-        reordered = function_reorders[full_name]
-        for idx, arg in enumerate(node.args):
-          lineno, col_offset = self._find_true_position(arg)
-          if lineno is None or col_offset is None:
-            self._file_edit.add(
-                "Failed to add keyword %r to reordered function %r" %
-                (reordered[idx], full_name),
-                arg.lineno,
-                arg.col_offset,
-                "",
-                "",
-                error="A necessary keyword argument failed to be inserted.")
-          else:
-            keyword_arg = reordered[idx]
-            if (full_name in function_keyword_renames and
-                keyword_arg in function_keyword_renames[full_name]):
-              keyword_arg = function_keyword_renames[full_name][keyword_arg]
-            self._file_edit.add("Added keyword %r to reordered function %r" %
-                                (reordered[idx], full_name), lineno, col_offset,
-                                "", keyword_arg + "=")
-
-      # Examine each keyword argument and convert it to the final renamed form
-      renamed_keywords = ({} if full_name not in function_keyword_renames else
-                          function_keyword_renames[full_name])
-      for keyword in node.keywords:
-        argkey = keyword.arg
-        argval = keyword.value
-
-        if argkey in renamed_keywords:
-          argval_lineno, argval_col_offset = self._find_true_position(argval)
-          if argval_lineno is not None and argval_col_offset is not None:
-            # TODO(aselle): We should scan backward to find the start of the
-            # keyword key. Unfortunately ast does not give you the location of
-            # keyword keys, so we are forced to infer it from the keyword arg
-            # value.
-            key_start = argval_col_offset - len(argkey) - 1
-            key_end = key_start + len(argkey) + 1
-            if (self._lines[argval_lineno - 1][key_start:key_end] == argkey +
-                "="):
-              self._file_edit.add("Renamed keyword argument from %r to %r" %
-                                  (argkey,
-                                   renamed_keywords[argkey]), argval_lineno,
-                                  argval_col_offset - len(argkey) - 1,
-                                  argkey + "=", renamed_keywords[argkey] + "=")
-              continue
-          self._file_edit.add(
-              "Failed to rename keyword argument from %r to %r" %
-              (argkey, renamed_keywords[argkey]),
-              argval.lineno,
-              argval.col_offset - len(argkey) - 1,
-              "",
-              "",
-              error="Failed to find keyword lexographically. Fix manually.")
-
-    ast.NodeVisitor.generic_visit(self, node)
+      name = full_name.split(".")[-1]
+    elif isinstance(node.func, ast.Name):
+      name = node.func.id
+    elif isinstance(node.func, ast.Attribute):
+      name = node.func.attr
+    else:
+      name = None
+
+    # Call standard transformers for this node.
+    # Make sure warnings come first, since args or names triggering warnings
+    # may be removed by the other transformations.
+    self._maybe_add_call_warning(node, full_name, name)
+    # Make all args into kwargs
+    self._maybe_add_arg_names(node, full_name)
+    # Argument name changes or deletions
+    self._maybe_modify_args(node, full_name, name)
+
+    # Call transformers. These have the ability to modify the node, and if they
+    # do, will return the new node they created (or the same node if they just
+    # changed it). The are given the parent, but we will take care of
+    # integrating their changes into the parent if they return a new node.
+    #
+    # These are matched on the old name, since renaming is performed by the
+    # Attribute visitor, which happens later.
+    transformers = self._get_applicable_entries("function_transformers",
+                                                full_name, name)
+
+    parent = self._stack[-2]
+
+    for transformer in transformers:
+      logs = []
+      errors = []
+      new_node = transformer(parent, node, full_name, name, logs, errors)
+      self.add_logs(logs)
+      self.add_errors(errors)
+      if new_node:
+        if new_node is not node:
+          pasta.ast_utils.replace_child(parent, node, new_node)
+          node = new_node
+          self._stack[-1] = node
+
+    self.generic_visit(node)
 
   def visit_Attribute(self, node):  # pylint: disable=invalid-name
-    """Handle bare Attributes i.e. [tf.foo, tf.bar].
+    """Handle bare Attributes i.e. [tf.foo, tf.bar]."""
+    assert self._stack[-1] is node
 
-    Args:
-      node: Node that is of type ast.Attribute
-    """
-    full_name, _ = self._get_attribute_full_path(node)
+    full_name = self._get_full_name(node)
     if full_name:
+      parent = self._stack[-2]
+
       # Make sure the warning comes first, otherwise the name may have changed
-      self._print_warning_for_function(node, full_name)
-      self._rename_functions(node, full_name)
-    if full_name in self._api_change_spec.change_to_function:
-      if not hasattr(node, "is_function_for_call"):
-        new_text = full_name + "()"
-        self._file_edit.add("Changed %r to %r" % (full_name, new_text),
-                            node.lineno, node.col_offset, full_name, new_text)
+      self._maybe_add_warning(node, full_name)
+
+      # Once we did a modification, node is invalid and not worth inspecting
+      # further. Also, we only perform modifications for simple nodes, so
+      # There'd be no point in descending further.
+      if self._maybe_rename(parent, node, full_name):
+        return
+      if self._maybe_change_to_function_call(parent, node, full_name):
+        return
 
-    ast.NodeVisitor.generic_visit(self, node)
+    self.generic_visit(node)
 
 
 class ASTCodeUpgrader(object):
@@ -429,16 +431,42 @@ class ASTCodeUpgrader(object):
     """
 
     # Write to a temporary file, just in case we are doing an implace modify.
+    # pylint: disable=g-backslash-continuation
     with open(in_filename, "r") as in_file, \
         tempfile.NamedTemporaryFile("w", delete=False) as temp_file:
       ret = self.process_opened_file(in_filename, in_file, out_filename,
                                      temp_file)
+    # pylint: enable=g-backslash-continuation
 
     shutil.move(temp_file.name, out_filename)
     return ret
 
-  # Broad exceptions are required here because ast throws whatever it wants.
-  # pylint: disable=broad-except
+  def _format_errors(self, errors, in_filename):
+    return ["%s:%d:%d: %s" % ((in_filename,) + error) for error in errors]
+
+  def update_string_pasta(self, text, in_filename):
+    """Updates a file using pasta."""
+    try:
+      t = pasta.parse(text)
+    except (SyntaxError, ValueError, TypeError):
+      log = "Failed to parse.\n\n" + traceback.format_exc()
+      return 0, "", log, []
+
+    visitor = _PastaEditVisitor(self._api_change_spec)
+    visitor.visit(t)
+
+    errors = self._format_errors(visitor.errors, in_filename)
+    return 1, pasta.dump(t), visitor.log_text(), errors
+
+  def _format_log(self, log, in_filename, out_filename):
+    text = "-" * 80 + "\n"
+    text += "Processing file %r\n outputting to %r\n" % (in_filename,
+                                                         out_filename)
+    text += "-" * 80 + "\n\n"
+    text += log
+    text += "-" * 80 + "\n\n"
+    return text
+
   def process_opened_file(self, in_filename, in_file, out_filename, out_file):
     """Process the given python file for incompatible changes.
 
@@ -453,33 +481,19 @@ class ASTCodeUpgrader(object):
     Returns:
       A tuple representing number of files processed, log of actions, errors
     """
-    process_errors = []
-    text = "-" * 80 + "\n"
-    text += "Processing file %r\n outputting to %r\n" % (in_filename,
-                                                         out_filename)
-    text += "-" * 80 + "\n\n"
-
-    parsed_ast = None
     lines = in_file.readlines()
-    try:
-      parsed_ast = ast.parse("".join(lines))
-    except Exception:
-      text += "Failed to parse %r\n\n" % in_filename
-      text += traceback.format_exc()
-    if parsed_ast:
-      visitor = _ASTCallVisitor(in_filename, lines, self._api_change_spec)
-      visitor.visit(parsed_ast)
-      out_text, new_text, process_errors = visitor.process(lines)
-      text += new_text
-      if out_file:
-        out_file.write(out_text)
-    text += "\n"
-    return 1, text, process_errors
-
-  # pylint: enable=broad-except
+    processed_file, new_file_content, log, process_errors = (
+        self.update_string_pasta("".join(lines), in_filename))
+
+    if out_file and processed_file:
+      out_file.write(new_file_content)
+
+    return (processed_file,
+            self._format_log(log, in_filename, out_filename),
+            process_errors)
 
   def process_tree(self, root_directory, output_root_directory,
-                   copy_other_files):
+                   copy_other_files, in_place):
     """Processes upgrades on an entire tree of python files in place.
 
     Note that only Python files. If you have custom code in other languages,
@@ -489,11 +503,20 @@ class ASTCodeUpgrader(object):
       root_directory: Directory to walk and process.
       output_root_directory: Directory to use as base.
       copy_other_files: Copy files that are not touched by this converter.
+      in_place: Allow the conversion of an entire directory in place.
 
     Returns:
       A tuple of files processed, the report string ofr all files, and errors
     """
 
+    if output_root_directory == root_directory:
+      if in_place:
+        return self.process_tree_inplace(root_directory)
+      else:
+        print("In order to copy a directory in place the `--inplace` input "
+              "arg must be set to `True`.")
+        sys.exit(1)
+
     # make sure output directory doesn't exist
     if output_root_directory and os.path.exists(output_root_directory):
       print("Output directory %r must not already exist." %
@@ -550,3 +573,26 @@ class ASTCodeUpgrader(object):
         os.makedirs(output_directory)
       shutil.copy(input_path, output_path)
     return file_count, report, tree_errors
+
+  def process_tree_inplace(self, root_directory):
+    """Process a directory of python files in place."""
+    files_to_process = []
+    for dir_name, _, file_list in os.walk(root_directory):
+      py_files = [os.path.join(dir_name,
+                               f) for f in file_list if f.endswith(".py")]
+      files_to_process += py_files
+
+    file_count = 0
+    tree_errors = []
+    report = ""
+    report += ("=" * 80) + "\n"
+    report += "Input tree: %r\n" % root_directory
+    report += ("=" * 80) + "\n"
+
+    for path in files_to_process:
+      file_count += 1
+      _, l_report, l_errors = self.process_file(path, path)
+      tree_errors += l_errors
+      report += l_report
+
+    return file_count, report, tree_errors
diff --git a/tensorflow/tools/compatibility/ast_edits_test.py b/tensorflow/tools/compatibility/ast_edits_test.py
index 99f20a026fcb9b60e0d4365dd2690946f0d833fc..4accd8fe2957374914dfad0b95d2f316d37d53f5 100644
--- a/tensorflow/tools/compatibility/ast_edits_test.py
+++ b/tensorflow/tools/compatibility/ast_edits_test.py
@@ -54,7 +54,6 @@ class NoUpdateSpec(ast_edits.APIChangeSpec):
     self.function_keyword_renames = {}
     self.symbol_renames = {}
     self.function_warnings = {}
-    self.unrestricted_function_warnings = {}
     self.change_to_function = {}
 
 
@@ -401,7 +400,8 @@ class TestAstEdits(test_util.TensorFlowTestCase):
 
       def __init__(self):
         NoUpdateSpec.__init__(self)
-        self.unrestricted_function_warnings = {"foo": "not good"}
+        self.function_warnings = {"*.foo": "not good"}
+
     texts = ["object.foo()", "get_object().foo()",
              "get_object().foo()", "object.foo().bar()"]
     for text in texts:
diff --git a/tensorflow/tools/compatibility/renames_v2.py b/tensorflow/tools/compatibility/renames_v2.py
index dba4a5d5a3f3c9f97f2ecea1179107d8cd9a9f72..b757699f636f2cbe7e6a0fb8ee9604ff5631d87f 100644
--- a/tensorflow/tools/compatibility/renames_v2.py
+++ b/tensorflow/tools/compatibility/renames_v2.py
@@ -34,6 +34,7 @@ renames = {
     'tf.ConfigProto': 'tf.compat.v1.ConfigProto',
     'tf.DeviceSpec': 'tf.compat.v1.DeviceSpec',
     'tf.Dimension': 'tf.compat.v1.Dimension',
+    'tf.Event': 'tf.compat.v1.Event',
     'tf.FIFOQueue': 'tf.queue.FIFOQueue',
     'tf.FixedLenFeature': 'tf.io.FixedLenFeature',
     'tf.FixedLenSequenceFeature': 'tf.io.FixedLenSequenceFeature',
@@ -73,6 +74,8 @@ renames = {
     'tf.SparseConditionalAccumulator': 'tf.sparse.SparseConditionalAccumulator',
     'tf.SparseFeature': 'tf.io.SparseFeature',
     'tf.SparseTensorValue': 'tf.compat.v1.SparseTensorValue',
+    'tf.Summary': 'tf.compat.v1.Summary',
+    'tf.SummaryMetadata': 'tf.compat.v1.SummaryMetadata',
     'tf.TFRecordReader': 'tf.compat.v1.TFRecordReader',
     'tf.TensorInfo': 'tf.compat.v1.TensorInfo',
     'tf.TextLineReader': 'tf.compat.v1.TextLineReader',
@@ -141,10 +144,11 @@ renames = {
     'tf.diag': 'tf.linalg.tensor_diag',
     'tf.diag_part': 'tf.linalg.tensor_diag_part',
     'tf.digamma': 'tf.math.digamma',
-    'tf.dimension_at_index': 'tf.compat.v1.dimension_at_index',
-    'tf.dimension_value': 'tf.compat.v1.dimension_value',
+    'tf.dimension_at_index': 'tf.compat.dimension_at_index',
+    'tf.dimension_value': 'tf.compat.dimension_value',
     'tf.disable_eager_execution': 'tf.compat.v1.disable_eager_execution',
     'tf.disable_resource_variables': 'tf.compat.v1.disable_resource_variables',
+    'tf.disable_v2_batch_normalization': 'tf.compat.v1.disable_v2_batch_normalization',
     'tf.disable_v2_behavior': 'tf.compat.v1.disable_v2_behavior',
     'tf.disable_v2_tensorshape': 'tf.compat.v1.disable_v2_tensorshape',
     'tf.distributions.Bernoulli': 'tf.compat.v1.distributions.Bernoulli',
@@ -168,6 +172,7 @@ renames = {
     'tf.div': 'tf.compat.v1.div',
     'tf.enable_eager_execution': 'tf.compat.v1.enable_eager_execution',
     'tf.enable_resource_variables': 'tf.compat.v1.enable_resource_variables',
+    'tf.enable_v2_batch_normalization': 'tf.compat.v1.enable_v2_batch_normalization',
     'tf.enable_v2_behavior': 'tf.compat.v1.enable_v2_behavior',
     'tf.enable_v2_tensorshape': 'tf.compat.v1.enable_v2_tensorshape',
     'tf.encode_base64': 'tf.io.encode_base64',
@@ -198,8 +203,8 @@ renames = {
     'tf.get_variable': 'tf.compat.v1.get_variable',
     'tf.get_variable_scope': 'tf.compat.v1.get_variable_scope',
     'tf.gfile.FastGFile': 'tf.compat.v1.gfile.FastGFile',
-    'tf.gfile.GFile': 'tf.compat.v1.gfile.GFile',
-    'tf.gfile.Open': 'tf.compat.v1.gfile.Open',
+    'tf.gfile.GFile': 'tf.io.gfile.GFile',
+    'tf.gfile.Open': 'tf.io.gfile.GFile',
     'tf.global_norm': 'tf.linalg.global_norm',
     'tf.global_variables': 'tf.compat.v1.global_variables',
     'tf.global_variables_initializer': 'tf.compat.v1.global_variables_initializer',
@@ -219,7 +224,6 @@ renames = {
     'tf.image.resize_area': 'tf.compat.v1.image.resize_area',
     'tf.image.resize_bicubic': 'tf.compat.v1.image.resize_bicubic',
     'tf.image.resize_bilinear': 'tf.compat.v1.image.resize_bilinear',
-    'tf.image.resize_images': 'tf.compat.v1.image.resize_images',
     'tf.image.resize_nearest_neighbor': 'tf.compat.v1.image.resize_nearest_neighbor',
     'tf.image.transpose_image': 'tf.compat.v1.image.transpose_image',
     'tf.initialize_all_tables': 'tf.compat.v1.initialize_all_tables',
@@ -607,8 +611,14 @@ renames = {
     'tf.string_strip': 'tf.strings.strip',
     'tf.string_to_hash_bucket_fast': 'tf.strings.to_hash_bucket_fast',
     'tf.string_to_hash_bucket_strong': 'tf.strings.to_hash_bucket_strong',
+    'tf.summary.Event': 'tf.compat.v1.summary.Event',
+    'tf.summary.FileWriter': 'tf.compat.v1.summary.FileWriter',
+    'tf.summary.FileWriterCache': 'tf.compat.v1.summary.FileWriterCache',
     'tf.summary.SessionLog': 'tf.compat.v1.summary.SessionLog',
     'tf.summary.audio': 'tf.compat.v1.summary.audio',
+    'tf.summary.Summary': 'tf.compat.v1.summary.Summary',
+    'tf.summary.SummaryDescription': 'tf.compat.v1.summary.SummaryDescription',
+    'tf.summary.TaggedRunMetadata': 'tf.compat.v1.summary.TaggedRunMetadata',
     'tf.summary.get_summary_description': 'tf.compat.v1.summary.get_summary_description',
     'tf.summary.histogram': 'tf.compat.v1.summary.histogram',
     'tf.summary.image': 'tf.compat.v1.summary.image',
@@ -712,6 +722,7 @@ renames = {
     'tf.train.slice_input_producer': 'tf.compat.v1.train.slice_input_producer',
     'tf.train.start_queue_runners': 'tf.compat.v1.train.start_queue_runners',
     'tf.train.string_input_producer': 'tf.compat.v1.train.string_input_producer',
+    'tf.train.summary_iterator': 'tf.compat.v1.train.summary_iterator',
     'tf.train.update_checkpoint_state': 'tf.compat.v1.train.update_checkpoint_state',
     'tf.train.warm_start': 'tf.compat.v1.train.warm_start',
     'tf.train.write_graph': 'tf.io.write_graph',
diff --git a/tensorflow/tools/compatibility/reorders_v2.py b/tensorflow/tools/compatibility/reorders_v2.py
index 3f05aea6cadeaf75ab57d43fe56d4a2d01b9ed3b..f9b0e3f9d8e6107701b01768b9674680d0e4b64a 100644
--- a/tensorflow/tools/compatibility/reorders_v2.py
+++ b/tensorflow/tools/compatibility/reorders_v2.py
@@ -31,6 +31,7 @@ reorders = {
     'tf.batch_gather': ['params', 'indices', 'name'],
     'tf.batch_to_space': ['input', 'crops', 'block_size', 'name'],
     'tf.boolean_mask': ['tensor', 'mask', 'name', 'axis'],
+    'tf.cond': ['pred', 'true_fn', 'false_fn', 'strict', 'name', 'fn1', 'fn2'],
     'tf.confusion_matrix': ['labels', 'predictions', 'num_classes', 'dtype', 'name', 'weights'],
     'tf.convert_to_tensor': ['value', 'dtype', 'name', 'preferred_dtype'],
     'tf.decode_csv': ['records', 'record_defaults', 'field_delim', 'use_quote_delim', 'name', 'na_value', 'select_cols'],
@@ -65,6 +66,7 @@ reorders = {
     'tf.nn.moments': ['x', 'axes', 'shift', 'name', 'keep_dims'],
     'tf.nn.pool': ['input', 'window_shape', 'pooling_type', 'padding', 'dilation_rate', 'strides', 'name', 'data_format'],
     'tf.nn.separable_conv2d': ['input', 'depthwise_filter', 'pointwise_filter', 'strides', 'padding', 'rate', 'name', 'data_format'],
+    'tf.nn.softmax_cross_entropy_with_logits': ['_sentinel', 'labels', 'logits', 'dim', 'name'],
     'tf.nn.space_to_batch': ['input', 'paddings', 'block_size', 'name'],
     'tf.nn.space_to_depth': ['input', 'block_size', 'name', 'data_format'],
     'tf.nn.weighted_moments': ['x', 'axes', 'frequency_weights', 'name', 'keep_dims'],
diff --git a/tensorflow/tools/compatibility/testdata/test_file_v1_12.py b/tensorflow/tools/compatibility/testdata/test_file_v1_12.py
index 5ce4dd49adc940dbc56e19915a188cdb6b8de1d1..2663762aa70253f54037393c0cb3cd791a040d56 100644
--- a/tensorflow/tools/compatibility/testdata/test_file_v1_12.py
+++ b/tensorflow/tools/compatibility/testdata/test_file_v1_12.py
@@ -70,6 +70,15 @@ class TestUpgrade(test_util.TensorFlowTestCase):
         [0],
         tf.argmin([[1, 3, 2]], name='abc', dimension=1))
 
+  @test_util.run_v1_only("b/120545219")
+  def testSoftmaxCrossEntropyWithLogits(self):
+    out = tf.nn.softmax_cross_entropy_with_logits(
+        logits=[0.1, 0.8], labels=[0, 1])
+    self.assertAllClose(out, 0.40318608)
+    out = tf.nn.softmax_cross_entropy_with_logits_v2(
+        logits=[0.1, 0.8], labels=[0, 1])
+    self.assertAllClose(out, 0.40318608)
+
 
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/tools/compatibility/tf_upgrade.py b/tensorflow/tools/compatibility/tf_upgrade.py
index 68d2c02570797a90e644ac5110ea177b1f54896b..241b08510f6b1c7b62ab3563752b042bd1366f99 100644
--- a/tensorflow/tools/compatibility/tf_upgrade.py
+++ b/tensorflow/tools/compatibility/tf_upgrade.py
@@ -175,27 +175,13 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.op_scope": ["values", "name", "default_name"],
     }
 
-    # Specially handled functions.
-    self.function_handle = {"tf.reverse": self._reverse_handler}
-
     # Warnings that should be printed if corresponding functions are used.
-    self.function_warnings = {}
-
-  @staticmethod
-  def _reverse_handler(file_edit_recorder, node, lines):
-    del lines
-    # TODO(aselle): Could check for a literal list of bools and try to convert
-    # them to indices.
-    comment = ("ERROR: tf.reverse has had its argument semantics changed "
-               "significantly the converter cannot detect this reliably, so "
-               "you need to inspect this usage manually.\n")
-    file_edit_recorder.add(
-        comment,
-        node.lineno,
-        node.col_offset,
-        "tf.reverse",
-        "tf.reverse",
-        error="tf.reverse requires manual check.")
+    self.function_warnings = {
+        "tf.reverse":
+            "ERROR: tf.reverse has had its argument semantics changed "
+            "significantly. The converter cannot detect this reliably, so "
+            "you need to inspect this usage manually.\n",
+    }
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/tools/compatibility/tf_upgrade_test.py b/tensorflow/tools/compatibility/tf_upgrade_test.py
index 66325ea2ad36265c6c3779b414774abab8213a84..cf05575a9dd0cf6940a18e801fc76b667dbda233 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_test.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_test.py
@@ -112,7 +112,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     text = "tf.reverse(a, b)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, new_text)
-    self.assertEqual(errors, ["test.py:1: tf.reverse requires manual check."])
+    self.assertIn("tf.reverse requires manual check", errors[0])
 
   def testListComprehension(self):
     def _test(input, output):  # pylint: disable=redefined-builtin
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2.py b/tensorflow/tools/compatibility/tf_upgrade_v2.py
index c0be267217d424648ccd4b19a8af3a4e68695375..2c56cbf36bd0c98e176241a9eeb7743d40327e1f 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2.py
@@ -18,7 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import re
+import ast
+
+import pasta
+import six
 
 from tensorflow.tools.compatibility import ast_edits
 from tensorflow.tools.compatibility import renames_v2
@@ -31,7 +34,27 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
   def __init__(self):
     # Maps from a function name to a dictionary that describes how to
     # map from an old argument keyword to the new argument keyword.
+    # If the new argument is None, it will be removed.
+    # Only keyword args are handled, so make sure to also put any function in
+    # function_reorders to ensure that all args are made into keywords first.
     self.function_keyword_renames = {
+        "tf.gradients": {
+            "colocate_gradients_with_ops": None,
+        },
+        "tf.hessians": {
+            "colocate_gradients_with_ops": None,
+        },
+        "*.minimize": {
+            "colocate_gradients_with_ops": None,
+        },
+        "*.compute_gradients": {
+            "colocate_gradients_with_ops": None,
+        },
+        "tf.cond": {
+            "strict": None,
+            "fn1": "true_fn",
+            "fn2": "false_fn"
+        },
         "tf.argmin": {
             "dimension": "axis",
         },
@@ -77,6 +100,10 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.convert_to_tensor": {
             "preferred_dtype": "dtype_hint"
         },
+        "tf.nn.softmax_cross_entropy_with_logits": {
+            "dim": "axis",
+            "_sentinel": None,
+        },
         "tf.nn.softmax_cross_entropy_with_logits_v2": {
             "dim": "axis"
         },
@@ -572,6 +599,8 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.compat.v1.initializers.random_normal",
         "tf.truncated_normal_initializer":
             "tf.compat.v1.initializers.truncated_normal",
+        "tf.image.resize_images":
+            "tf.image.resize",
     }
     # pylint: enable=line-too-long
 
@@ -596,6 +625,7 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.argmin",
         "tf.batch_gather",
         "tf.batch_to_space",
+        "tf.cond",
         "tf.nn.space_to_batch",
         "tf.boolean_mask",
         "tf.convert_to_tensor",
@@ -665,6 +695,10 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.norm",
         "tf.reverse_sequence",
         "tf.sparse_split",
+        # tf.nn.softmax_cross_entropy_with_logits *must* be called with
+        # keyword arguments. Add keyword arguments in rare case when they
+        # are not specified.
+        "tf.nn.softmax_cross_entropy_with_logits",
     }
 
     # Functions that were reordered should be changed to the new keyword args
@@ -672,14 +706,38 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
     # positional arguments yourself, this could do the wrong thing.
     self.function_reorders = reorders_v2.reorders
 
-    # Specially handled functions.
-    self.function_handle = {
-        "tf.batch_gather": self._batch_gather_handler,
-        "tf.nn.dropout": self._dropout_handler,
-        "tf.gradients": self._colocate_handler("tf.gradients"),
-        "*.minimize": self._colocate_handler("Optimizer.minimize"),
-        "*.compute_gradients":
-            self._colocate_handler("Optimizer.compute_gradients"),
+    # Specially handled functions (pasta version)
+    # Each transformer is a callable which will be called with the arguments
+    #   transformer(parent, node, full_name, name, logs, errors)
+    # Where logs and errors are lists to which (line, col, msg) tuples can be
+    # appended, full_name is the FQN of the function called (or None if that is
+    # unknown), name is the name of the function called (or None is that is
+    # unknown). node is an ast.Call node representing this function call, and
+    # parent is its parent in the AST.
+    # The function may modify node (but not parent), and must return
+    # - none, if nothing was modified
+    # - node, if node was modified in place (make sure to use
+    #   pasta.ast_utils.replace_child to swap out children, otherwise formatting
+    #   may get messy)
+    # - a replacement for node, if the whole call node was replaced. The caller
+    #   will take care of changing parent.
+    self.function_transformers = {
+        "tf.nn.dropout": self._dropout_transformer,
+        "tf.batch_gather": self._batch_gather_transformer,
+        "tf.to_bfloat16": self._cast_transformer,
+        "tf.to_complex128": self._cast_transformer,
+        "tf.to_complex64": self._cast_transformer,
+        "tf.to_double": self._cast_transformer,
+        "tf.to_float": self._cast_transformer,
+        "tf.to_int32": self._cast_transformer,
+        "tf.to_int64": self._cast_transformer,
+        "tf.nn.softmax_cross_entropy_with_logits":
+            self._softmax_cross_entropy_with_logits_transformer,
+        "tf.image.resize_area": self._image_resize_transformer,
+        "tf.image.resize_bicubic": self._image_resize_transformer,
+        "tf.image.resize_bilinear": self._image_resize_transformer,
+        "tf.image.resize_nearest_neighbor": self._image_resize_transformer,
+
     }
 
     decay_function_comment = (
@@ -736,9 +794,57 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         " tf.initializers.variance_scaling instead with distribution=uniform "
         "to get equivalent behaviour.")
 
+    metrics_comment = (
+        "WARNING: tf.metrics have been converted to object oriented versions in"
+        " TF 2.0 and after. The metric function calls have been converted to "
+        "compat.v1 for backward compatibility. Please update these calls to "
+        "the TF 2.0 versions.")
+
+    losses_comment = (
+        "WARNING: tf.losses have been converted to object oriented versions in"
+        " TF 2.0 and after. The loss function calls have been converted to "
+        "compat.v1 for backward compatibility. Please update these calls to "
+        "the TF 2.0 versions.")
+
+    export_saved_model_renamed = (
+        "(Manual edit required) Please rename the method export_savedmodel() "
+        "to export_saved_model(). Two things to note:\n\t(1) The argument "
+        "strip_default_attributes has been removed. The function will always "
+        "strip the default attributes from ops. If this breaks your code, "
+        "please switch to tf.compat.v1.estimator.Estimator.\n\t(2) This change "
+        "only effects core estimator. If you are using "
+        "tf.contrib.learn.Estimator, please switch to using core estimator.")
+
+    make_initializable_iterator_deprecation = (
+        "(Manual edit required) The "
+        "`tf.data.Dataset.make_initializable_iterator()` method has been "
+        "removed. If you are using the Estimator API, you can return a dataset "
+        "directly from your input functions without creating an iterator. "
+        "As a last resort, please replace calls to that method on `dataset` "
+        "with a call to "
+        "`tf.compat.v1.data.make_initializable_iterator(dataset)`.")
+
+    make_one_shot_iterator_deprecation = (
+        "(Manual edit required) The "
+        "`tf.data.Dataset.make_one_shot_iterator()` method has been "
+        "removed. If you are using eager execution, you can iterate over "
+        "`dataset` using a Python `for` loop. If you are using the Estimator "
+        "API, you can return a dataset directly from your input functions "
+        "without creating an iterator. As a last resort, please replace calls "
+        "to that method on `dataset` with a call to "
+        "`tf.compat.v1.data.make_one_shot_iterator(dataset)`.")
+
     # Function warnings. <function name> placeholder inside warnings will be
     # replaced by function name.
+    # You can use *. to add items which do not check the FQN, and apply to e.g.,
+    # methods.
     self.function_warnings = {
+        "*.export_savedmodel":
+            export_saved_model_renamed,
+        "*.make_initializable_iterator":
+            make_initializable_iterator_deprecation,
+        "*.make_one_shot_iterator":
+            make_one_shot_iterator_deprecation,
         "tf.assert_greater":
             assert_return_type_comment,
         "tf.assert_equal":
@@ -747,9 +853,6 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             assert_return_type_comment,
         "tf.assert_rank":
             assert_rank_comment,
-        "tf.cond": "tf.cond no longer takes 'strict'. "
-                   "Now 'strict' defaults to True."
-                   "fn1/fn2 arguments are replaced by true_fn/false_fn.",
         "tf.debugging.assert_equal":
             assert_return_type_comment,
         "tf.debugging.assert_greater":
@@ -780,9 +883,10 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             assert_rank_comment,
         "tf.debugging.assert_rank_in":
             assert_rank_comment,
-        "tf.device": "tf.device no longer takes function as an argument. "
-                     "'devide_name_or_function' argument has been renamed to "
-                     "'device_name'.",
+        "tf.device":
+            "tf.device no longer takes function as an argument. "
+            "'devide_name_or_function' argument has been renamed to "
+            "'device_name'.",
         "tf.flags":
             "tf.flags has been removed, please use the argparse or absl"
             " module if you need command line parsing.",
@@ -820,10 +924,6 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             default_loss_reduction_changed,
         "tf.estimator.BaselineRegressor":
             default_loss_reduction_changed,
-        "tf.hessians": "tf.hessians no longer takes "
-                       "'colocate_gradients_with_ops' argument. Also, "
-                       "arguments have been reordered so that 'name' is the "
-                       "last argument.",
         "tf.nn.conv1d":
             "WARNING: use_cudnn_on_gpu argument has been removed and \"value\""
             " was renamed to \"input\"",
@@ -870,10 +970,6 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "'deterministic' arguments. Now it takes a single 'seed' arg. If "
             "'seed' is zero, the execution is random and deterministic "
             "otherwise",
-        "tf.nn.softmax_cross_entropy_with_logits":
-            "tf.nn.softmax_cross_entropy_with_logits behavior has changed. "
-            "'labels' needs to be wrapped with tf.stop_gradient to keep the "
-            "old behavior. Also, 'dim' argument has been renamed to 'axis'.",
         "tf.test.assert_equal_graph_def":
             "tf.assert_equal_graph_def no longer takes 'checkpoint_v2' "
             "argument. 'checkpoint_v2' now defaults to True.",
@@ -949,6 +1045,131 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             uniform_unit_scaling_initializer_comment,
         "tf.uniform_unit_scaling_initializer":
             uniform_unit_scaling_initializer_comment,
+        "tf.losses.absolute_difference":
+            losses_comment,
+        "tf.losses.add_loss":
+            losses_comment,
+        "tf.losses.compute_weighted_loss":
+            losses_comment,
+        "tf.losses.cosine_distance":
+            losses_comment,
+        "tf.losses.get_losses":
+            losses_comment,
+        "tf.losses.get_regularization_loss":
+            losses_comment,
+        "tf.losses.get_regularization_losses":
+            losses_comment,
+        "tf.losses.get_total_loss":
+            losses_comment,
+        "tf.losses.hinge_loss":
+            losses_comment,
+        "tf.losses.huber_loss":
+            losses_comment,
+        "tf.losses.log_loss":
+            losses_comment,
+        "tf.losses.mean_pairwise_squared_error":
+            losses_comment,
+        "tf.losses.mean_squared_error":
+            losses_comment,
+        "tf.losses.sigmoid_cross_entropy":
+            losses_comment,
+        "tf.losses.softmax_cross_entropy":
+            losses_comment,
+        "tf.losses.sparse_softmax_cross_entropy":
+            losses_comment,
+        "tf.metrics.accuracy":
+            metrics_comment,
+        "tf.metrics.auc":
+            metrics_comment,
+        "tf.metrics.average_precision_at_k":
+            metrics_comment,
+        "tf.metrics.false_negatives":
+            metrics_comment,
+        "tf.metrics.false_negatives_at_thresholds":
+            metrics_comment,
+        "tf.metrics.false_positives":
+            metrics_comment,
+        "tf.metrics.false_positives_at_thresholds":
+            metrics_comment,
+        "tf.metrics.mean":
+            metrics_comment,
+        "tf.metrics.mean_absolute_error":
+            metrics_comment,
+        "tf.metrics.mean_cosine_distance":
+            metrics_comment,
+        "tf.metrics.mean_iou":
+            metrics_comment,
+        "tf.metrics.mean_per_class_accuracy":
+            metrics_comment,
+        "tf.metrics.mean_relative_error":
+            metrics_comment,
+        "tf.metrics.mean_squared_error":
+            metrics_comment,
+        "tf.metrics.mean_tensor":
+            metrics_comment,
+        "tf.metrics.percentage_below":
+            metrics_comment,
+        "tf.metrics.precision":
+            metrics_comment,
+        "tf.metrics.precision_at_k":
+            metrics_comment,
+        "tf.metrics.precision_at_thresholds":
+            metrics_comment,
+        "tf.metrics.precision_at_top_k":
+            metrics_comment,
+        "tf.metrics.recall":
+            metrics_comment,
+        "tf.metrics.recall_at_k":
+            metrics_comment,
+        "tf.metrics.recall_at_thresholds":
+            metrics_comment,
+        "tf.metrics.recall_at_top_k":
+            metrics_comment,
+        "tf.metrics.root_mean_squared_error":
+            metrics_comment,
+        "tf.metrics.sensitivity_at_specificity":
+            metrics_comment,
+        "tf.metrics.sparse_average_precision_at_k":
+            metrics_comment,
+        "tf.metrics.sparse_precision_at_k":
+            metrics_comment,
+        "tf.metrics.specificity_at_sensitivity":
+            metrics_comment,
+        "tf.metrics.true_negatives":
+            metrics_comment,
+        "tf.metrics.true_negatives_at_thresholds":
+            metrics_comment,
+        "tf.metrics.true_positives":
+            metrics_comment,
+        "tf.metrics.true_positives_at_thresholds":
+            metrics_comment,
+    }
+
+    # Warnings that are emitted only if a specific arg is found.
+    self.function_arg_warnings = {
+        "tf.gradients": {
+            ("colocate_gradients_with_ops", 4):
+                "tf.gradients no longer takes "
+                "'colocate_gradients_with_ops' argument, it behaves as if it "
+                "was set to True.",
+        },
+        "*.minimize": {
+            ("colocate_gradients_with_ops", 5):
+                "Optimizer.minimize no longer takes "
+                "'colocate_gradients_with_ops' argument, it behaves as if it "
+                "was set to True.",
+        },
+        "*.compute_gradients": {
+            ("colocate_gradients_with_ops", 4):
+                "Optimizer.compute_gradients no "
+                "longer takes 'colocate_gradients_with_ops' argument, it "
+                "behaves as if it was set to True.",
+        },
+        "tf.cond": {
+            ("strict", 3):
+                "tf.cond no longer takes 'strict' argument, it behaves as "
+                "if was set to True."
+        },
     }
 
     self.symbol_renames = {
@@ -956,106 +1177,176 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         for name, new_name in self.symbol_renames.items()
     }
 
-    export_saved_model_renamed = (
-        "(Manual edit required) Please rename the method export_savedmodel() "
-        "to export_saved_model(). Two things to note:\n\t(1) The argument "
-        "strip_default_attributes has been removed. The function will always "
-        "strip the default attributes from ops. If this breaks your code, "
-        "please switch to tf.compat.v1.estimator.Estimator.\n\t(2) This change "
-        "only effects core estimator. If you are using "
-        "tf.contrib.learn.Estimator, please switch to using core estimator.")
-
-    make_initializable_iterator_deprecation = (
-        "(Manual edit required) The "
-        "`tf.data.Dataset.make_initializable_iterator()` method has been "
-        "removed. If you are using the Estimator API, you can return a dataset "
-        "directly from your input functions without creating an iterator. "
-        "As a last resort, please replace calls to that method on `dataset` "
-        "with a call to "
-        "`tf.compat.v1.data.make_initializable_iterator(dataset)`.")
+  @staticmethod
+  def _dropout_transformer(parent, node, full_name, name, logs, errors):
+    def _replace_keep_prob_node(parent, old_value):
+      """Replaces old_value with 1-(old_value)."""
+      one = ast.Num(n=1)
+      one.lineno = 0
+      one.col_offset = 0
+      new_value = ast.BinOp(left=one, op=ast.Sub(),
+                            right=old_value)
+      # This copies the prefix and suffix on old_value to new_value.
+      pasta.ast_utils.replace_child(parent, old_value, new_value)
+      ast.copy_location(new_value, old_value)
+      # Put parentheses around keep_prob.value (and remove the old prefix/
+      # suffix, they should only be around new_value).
+      pasta.base.formatting.set(old_value, "prefix", "(")
+      pasta.base.formatting.set(old_value, "suffix", ")")
 
-    make_one_shot_iterator_deprecation = (
-        "(Manual edit required) The "
-        "`tf.data.Dataset.make_one_shot_iterator()` method has been "
-        "removed. If you are using eager execution, you can iterate over "
-        "`dataset` using a Python `for` loop. If you are using the Estimator "
-        "API, you can return a dataset directly from your input functions "
-        "without creating an iterator. As a last resort, please replace calls "
-        "to that method on `dataset` with a call to "
-        "`tf.compat.v1.data.make_one_shot_iterator(dataset)`.")
+    # Check if we have a keep_prob keyword arg
+    for keep_prob in node.keywords:
+      if keep_prob.arg == "keep_prob":
+        logs.append((node.lineno, node.col_offset,
+                     "Changing keep_prob arg of tf.nn.dropout to rate, and "
+                     "recomputing value. Please check this transformation.\n"))
+        keep_prob.arg = "rate"
+        _replace_keep_prob_node(keep_prob, keep_prob.value)
+        return node
 
-    # Specify warnings for functions that aren't restricted to the tf.x.y.z
-    # format. This should only be used for methods with unique names, e.g.
-    # export_savedmodel, which is only defined in Estimator objects.
-    self.unrestricted_function_warnings = {
-        "export_savedmodel": export_saved_model_renamed,
-        "make_initializable_iterator": make_initializable_iterator_deprecation,
-        "make_one_shot_iterator": make_one_shot_iterator_deprecation,
-    }
+    # Maybe it was a positional arg
+    if len(node.args) < 2:
+      errors.append((node.lineno, node.col_offset,
+                     "ERROR: tf.nn.dropout called without arguments, so "
+                     "automatic fix was disabled. tf.nn.dropout has changed "
+                     "the semantics of the second argument."))
+    else:
+      _replace_keep_prob_node(node, node.args[1])
+      logs.append((node.lineno, node.col_offset,
+                   "Changing keep_prob arg of tf.nn.dropout to rate, and "
+                   "recomputing value.\n"))
+      errors.append((node.lineno, node.col_offset,
+                     "WARNING: tf.nn.dropout has changed the semantics of the "
+                     "second argument. Please check the applied transformation."
+                    ))
+      return node
 
   @staticmethod
-  def _dropout_handler(file_edit_recorder, node, lines):
-    del lines
-    if len(node.args) < 2:
-      comment = ("ERROR: tf.nn.dropout did not take arguments, so automatic "
-                 "transformation was disabled. tf.nn.dropout has changed "
-                 "the semantics of the second argument.")
-      file_edit_recorder.add(
-          comment,
-          node.lineno,
-          node.col_offset,
-          "tf.nn.dropout",
-          "tf.nn.dropout",
-          error="tf.nn.dropout requires manual check.")
+  def _cast_transformer(parent, node, full_name, name, logs, errors):
+    """Transforms to_int and to_float to cast(..., dtype=...)."""
+
+    # Find out the dtype to cast to from the function name
+    dtype_str = name[3:]
+    # Special cases where the full dtype is not given
+    if dtype_str == "float":
+      dtype_str = "float32"
+    elif dtype_str == "double":
+      dtype_str = "float64"
+    new_arg = ast.keyword(arg="dtype",
+                          value=ast.Attribute(value=ast.Name(id="tf",
+                                                             ctx=ast.Load()),
+                                              attr=dtype_str, ctx=ast.Load()))
+    # Ensures a valid transformation when a positional name arg is given
+    if len(node.args) == 2:
+      name_arg = ast.keyword(arg="name",
+                             value=node.args[-1])
+      node.args = node.args[:-1]
+      node.keywords.append(name_arg)
+
+    # Python3 ast requires the args for the Attribute, but codegen will mess up
+    # the arg order if we just set them to 0.
+    new_arg.value.lineno = node.lineno
+    new_arg.value.col_offset = node.col_offset+100
+
+    node.keywords.append(new_arg)
+    if isinstance(node.func, ast.Attribute):
+      node.func.attr = "cast"
     else:
-      comment = ("WARNING: tf.nn.dropout has changed the semantics of the "
-                 "second argument. Please check the transformation.\n")
-      file_edit_recorder.add(
-          comment,
-          node.args[1].lineno,
-          node.args[1].col_offset,
-          "",
-          "1 - ")
+      assert isinstance(node.func, ast.Name)
+      node.func.id = "cast"
+
+    logs.append((node.lineno, node.col_offset,
+                 "Changed %s call to tf.cast(..., dtype=tf.%s)." % (full_name,
+                                                                    dtype_str)))
+    return node
 
   @staticmethod
-  def _colocate_handler(name):
-    def _helper(file_edit_recorder, node, lines):
-      """Handler for updating colocate arguments."""
-      del lines
-      for keyword in node.keywords:
-        if keyword.arg == "colocate_gradients_with_ops":
-          # TODO(jhseu): Since ast_edit.py does string replacement, there's no
-          # straightforward way to remove the argument. Try to fix before 2.0 is
-          # final.
-          comment = ("For tf.gradients and tf.Optimizer.minimize, "
-                     "colocate_gradients_with_op has been removed and now "
-                     "defaults to True.")
-          file_edit_recorder.add(
-              comment,
-              node.lineno,
-              node.col_offset,
-              "",
-              "",
-              error="{} requires manual check.".format(name))
-    return _helper
+  def _softmax_cross_entropy_with_logits_transformer(
+      parent, node, full_name, name, logs, errors):
+    def _wrap_label(parent, old_value):
+      """Wrap labels with tf.stop_gradient."""
+      if six.PY3:
+        new_value = ast.Call(
+            ast.Name(id="tf.stop_gradient", ctx=ast.Load()),
+            [old_value], [])
+      else:
+        new_value = ast.Call(
+            ast.Name(id="tf.stop_gradient", ctx=ast.Load()),
+            [old_value], [], None, None)
+
+      # This copies the prefix and suffix on old_value to new_value.
+      pasta.ast_utils.replace_child(parent, old_value, new_value)
+      ast.copy_location(new_value, old_value)
+
+    # Check if we have a labels keyword arg
+    for karg in node.keywords:
+      if karg.arg == "labels":
+        logs.append((node.lineno, node.col_offset,
+                     "Changing labels arg of "
+                     "tf.nn.softmax_cross_entropy_with_logits to "
+                     "tf.stop_gradient(labels). Please check this "
+                     "transformation.\n"))
+        _wrap_label(karg, karg.value)
+        return node
+    return node
 
   @staticmethod
-  def _batch_gather_handler(file_edit_recorder, node, lines):
-    lineno = node.lineno
-    column = node.col_offset
+  def _batch_gather_transformer(parent, node, full_name, name, logs, errors):
+    # Check if the call already has a batch_dims argument
+    if any([kw.arg == "batch_dims" for kw in node.keywords]):
+      logs.append((node.lineno, node.col_offset, "tf.batch_gather already has "
+                   "batch_dims argument. Neat."))
+      return None
+
+    minus_one = ast.Num(n=-1)
+    minus_one.lineno = 0
+    minus_one.col_offset = 0
+    new_arg = ast.keyword("batch_dims", minus_one)
+    node.keywords.append(new_arg)
+    logs.append((node.lineno, node.col_offset,
+                 "Added keyword argument batch_dims=-1 to tf.batch_gather."))
+    return node
 
-    # Find the position to add the batch_dims argument.  We add it as the
-    # first argument, since that's easiest.  This is safe because we included
-    # batch_gather in self.reordered_function_names, so it will have all
-    # of its arguments changed to keyword arguments.
-    m = re.match(r"tf\s*\.\s*batch_gather\s*\(", lines[lineno - 1][column:])
-    if m is not None:
-      file_edit_recorder.add(
-          "Added keyword argument 'batch_dims=-1' to 'tf.batch_gather'",
-          lineno, column + m.end(), "", "batch_dims=-1, ")
+  @staticmethod
+  def _image_resize_transformer(parent, node, full_name, name, logs, errors):
+    """Transforms image.resize_* to image.resize(..., method=*, ...)."""
+
+    resize_method = name[7:].upper()
+    new_arg = ast.keyword(arg="method",
+                          value=ast.Attribute(
+                              value=ast.Attribute(
+                                  value=ast.Attribute(
+                                      value=ast.Name(id="tf", ctx=ast.Load()),
+                                      attr="image", ctx=ast.Load()),
+                                  attr="ResizeMethod", ctx=ast.Load()),
+                              attr=resize_method, ctx=ast.Load()))
+
+    # Ensures a valid transformation when a positional name arg is given
+    if len(node.args) == 4:
+      pos_arg = ast.keyword(arg="preserve_aspect_ratio",
+                            value=node.args[-1])
+      node.args = node.args[:-1]
+      node.keywords.append(pos_arg)
+    if len(node.args) == 3:
+      pos_arg = ast.keyword(arg="align_corners",
+                            value=node.args[-1])
+      node.args = node.args[:-1]
+      node.keywords.append(pos_arg)
+
+    # Python3 ast requires the args for the Attribute, but codegen will mess up
+    # the arg order if we just set them to 0.
+    new_arg.value.lineno = node.lineno
+    new_arg.value.col_offset = node.col_offset+100
+
+    node.keywords.append(new_arg)
+    if isinstance(node.func, ast.Attribute):
+      node.func.attr = "resize"
     else:
-      file_edit_recorder.add(
-          "Unable to add keyword argument 'batch_dims=-1' to 'tf.batch_gather'",
-          lineno, column, "", "",
-          error="Unable to add keyword argument batch_dims=-1 to "
-          "tf.batch_gather; please add it manually.")
+      assert isinstance(node.func, ast.Name)
+      node.func.id = "resize"
+
+    logs.append((node.lineno, node.col_offset,
+                 "Changed %s call to tf.image.resize(..., "
+                 "method=tf.image.ResizeMethod.%s)." % (full_name,
+                                                        resize_method)))
+    return node
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_main.py b/tensorflow/tools/compatibility/tf_upgrade_v2_main.py
index 543d0786423f5b3f9bc59895c1325d19b6241cf7..870bc6f2163f91eb4fd1e3c71a99bed022bf472f 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2_main.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_main.py
@@ -59,6 +59,14 @@ Simple usage:
             "copy the other files."),
       type=bool,
       default=True)
+  parser.add_argument(
+      "--inplace",
+      dest="in_place",
+      help=("If converting a whole tree of files, whether to "
+            "allow the conversion to be performed on the "
+            "files in the input tree."),
+      type=bool,
+      default=False)
   parser.add_argument(
       "--reportfile",
       dest="report_filename",
@@ -86,7 +94,7 @@ Simple usage:
           "--outtree=<output directory> argument is required when converting a "
           "file tree.")
     files_processed, report_text, errors = upgrade.process_tree(
-        args.input_tree, args.output_tree, args.copy_other_files)
+        args.input_tree, args.output_tree, args.copy_other_files, args.in_place)
   else:
     parser.print_help()
   if report_text:
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
index 9a89eda3e0ddf4527c9cf8dbf6b5a19213f9ffb6..0a85cb39c8973251fb233a322c18b6aefd881d17 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
@@ -239,8 +239,8 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     }
     function_warnings = (
         tf_upgrade_v2.TFAPIChangeSpec().function_warnings)
-    function_handles = (
-        tf_upgrade_v2.TFAPIChangeSpec().function_handle)
+    function_transformers = (
+        tf_upgrade_v2.TFAPIChangeSpec().function_transformers)
     keyword_renames = (
         tf_upgrade_v2.TFAPIChangeSpec().function_keyword_renames)
 
@@ -255,7 +255,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
 
         for name in names_v1:
           tf_name = "tf.%s" % name
-          if tf_name in function_warnings or tf_name in function_handles:
+          if tf_name in function_warnings or tf_name in function_transformers:
             continue  # These require manual change
           if tf_name in v1_name_exceptions:
             continue
@@ -362,17 +362,95 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
 
       text = "%s(a, b)\n" % decay
       _, report, errors, _ = self._upgrade(text)
-      self.assertEqual(errors, ["test.py:1: %s requires manual check." % decay])
+      self.assertIn("%s requires manual check" % decay, errors[0])
       self.assertIn("%s has been changed" % decay, report)
 
   def testPiecewiseDecay(self):
     text = "tf.train.piecewise_constant_decay(a, b)\n"
     _, report, errors, _ = self._upgrade(text)
-    self.assertEqual(
-        errors,
-        ["test.py:1: tf.train.piecewise_constant_decay requires manual check."])
+    self.assertIn("tf.train.piecewise_constant_decay requires manual check",
+                  errors[0])
     self.assertIn("tf.train.piecewise_constant_decay has been changed", report)
 
+  def testMetrics(self):
+    metrics = [
+        "accuracy",
+        "auc",
+        "average_precision_at_k",
+        "false_negatives",
+        "false_negatives_at_thresholds",
+        "false_positives",
+        "false_positives_at_thresholds",
+        "mean",
+        "mean_absolute_error",
+        "mean_cosine_distance",
+        "mean_iou",
+        "mean_per_class_accuracy",
+        "mean_relative_error",
+        "mean_squared_error",
+        "mean_tensor",
+        "percentage_below",
+        "precision",
+        "precision_at_k",
+        "precision_at_thresholds",
+        "precision_at_top_k",
+        "recall",
+        "recall_at_k",
+        "recall_at_thresholds",
+        "recall_at_top_k",
+        "root_mean_squared_error",
+        "sensitivity_at_specificity",
+        "sparse_average_precision_at_k",
+        "sparse_precision_at_k",
+        "specificity_at_sensitivity",
+        "true_negatives",
+        "true_negatives_at_thresholds",
+        "true_positives",
+        "true_positives_at_thresholds",
+    ]
+    for m in metrics:
+      ns = "tf.metrics." + m
+      text = ns + "(a, b)"
+      _, report, errors, new_text = self._upgrade(text)
+      self.assertEqual("tf.compat.v1.metrics." + m + "(a, b)", new_text)
+      self.assertIn("test.py:1:0: %s requires manual check" % ns, errors[0])
+      self.assertIn(
+          "WARNING: tf.metrics have been converted to object oriented"
+          " versions in TF 2.0 and after. The metric function calls have been "
+          "converted to compat.v1 for backward compatibility. Please update "
+          "these calls to the TF 2.0 versions.", report)
+
+  def testLosses(self):
+    losses = [
+        "absolute_difference",
+        "add_loss",
+        "compute_weighted_loss",
+        "cosine_distance",
+        "get_losses",
+        "get_regularization_loss",
+        "get_regularization_losses",
+        "get_total_loss",
+        "hinge_loss",
+        "huber_loss",
+        "log_loss",
+        "mean_pairwise_squared_error",
+        "mean_squared_error",
+        "sigmoid_cross_entropy",
+        "softmax_cross_entropy",
+        "sparse_softmax_cross_entropy",
+    ]
+    for l in losses:
+      ns = "tf.losses." + l
+      text = ns + "(a, b)"
+      _, report, errors, new_text = self._upgrade(text)
+      self.assertEqual("tf.compat.v1.losses." + l + "(a, b)", new_text)
+      self.assertIn("test.py:1:0: %s requires manual check" % ns, errors[0])
+      self.assertIn(
+          "WARNING: tf.losses have been converted to object oriented"
+          " versions in TF 2.0 and after. The loss function calls have been "
+          "converted to compat.v1 for backward compatibility. Please update "
+          "these calls to the TF 2.0 versions.", report)
+
   def testEstimatorLossReductionChange(self):
     classes = [
         "LinearClassifier", "LinearRegressor", "DNNLinearCombinedClassifier",
@@ -384,7 +462,7 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
       text = ns + "(a, b)"
       _, report, errors, new_text = self._upgrade(text)
       self.assertEqual(text, new_text)
-      self.assertEqual(errors, ["test.py:1: %s requires manual check." % ns])
+      self.assertIn("%s requires manual check" % ns, errors[0])
       self.assertIn("loss_reduction has been changed", report)
 
   def testDropout(self):
@@ -392,15 +470,40 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(
         new_text,
-        "tf.nn.dropout(x, 1 - keep_prob, name=\"foo\")\n",
+        "tf.nn.dropout(x, 1 - (keep_prob), name=\"foo\")\n",
+    )
+
+    text = "tf.nn.dropout(x, keep_prob=.4, name=\"foo\")\n"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(
+        new_text,
+        "tf.nn.dropout(x, rate=1 - (.4), name=\"foo\")\n",
+    )
+
+    text = (
+        "tf.nn.dropout(x,  # Stuff before\n"
+        "              keep_prob=.4,  # Stuff after\n"
+        "              name=\"foo\")\n"
+    )
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(
+        new_text,
+        "tf.nn.dropout(x,  # Stuff before\n"
+        "              rate=1 - (.4),  # Stuff after\n"
+        "              name=\"foo\")\n",
     )
 
     text = "tf.nn.dropout(x)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, text)
+    self.assertIn("tf.nn.dropout called without arguments", errors[0])
+
+  def testDropoutExpr(self):
+    text = "tf.nn.dropout(x, 1 - func(3 + 4.), name=\"foo\")\n"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(
-        errors,
-        ["test.py:1: tf.nn.dropout requires manual check."]
+        new_text,
+        "tf.nn.dropout(x, 1 - (1 - func(3 + 4.)), name=\"foo\")\n",
     )
 
   def testCountNonZeroChanges(self):
@@ -464,9 +567,11 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
 
     text = "tf.gradients(a, colocate_gradients_with_ops=False)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
-    self.assertEqual(text, new_text)
-    self.assertEqual(errors, ["test.py:1: tf.gradients requires manual check."])
+    self.assertEqual("tf.gradients(a)\n", new_text)
+    self.assertIn("tf.gradients", errors[0])
+    self.assertIn("requires manual check", errors[0])
 
+  def testColocateGradientsWithOpsMinimize(self):
     text = "optimizer.minimize(a, foo=False)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
     self.assertEqual(text, new_text)
@@ -474,10 +579,11 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
 
     text = "optimizer.minimize(a, colocate_gradients_with_ops=False)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
-    self.assertEqual(text, new_text)
-    self.assertEqual(errors,
-                     ["test.py:1: Optimizer.minimize requires manual check."])
+    self.assertEqual("optimizer.minimize(a)\n", new_text)
+    self.assertIn("requires manual check", errors[0])
+    self.assertIn("minimize", errors[0])
 
+  def testColocateGradientsWithOpsComputeGradients(self):
     text = "optimizer.compute_gradients(a, foo=False)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
     self.assertEqual(text, new_text)
@@ -485,10 +591,9 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
 
     text = "optimizer.compute_gradients(a, colocate_gradients_with_ops=False)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
-    self.assertEqual(text, new_text)
-    self.assertEqual(errors,
-                     ["test.py:1: Optimizer.compute_gradients "
-                      "requires manual check."])
+    self.assertEqual("optimizer.compute_gradients(a)\n", new_text)
+    self.assertIn("requires manual check", errors[0])
+    self.assertIn("compute_gradients", errors[0])
 
   def testExportSavedModelRename(self):
     text = "self.est.export_savedmodel(path)"
@@ -579,26 +684,32 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     self.assertEqual(new_text, expected_text)
 
   def testSoftMaxCrossEntropyWithLogitsV2(self):
-    text = "tf.nn.softmax_cross_entropy_with_logits_v2(labels, logits, dim=2)"
+    text = (
+        "tf.nn.softmax_cross_entropy_with_logits_v2("
+        "labels=labels, logits=logits, dim=2)")
     expected_text = (
-        "tf.nn.softmax_cross_entropy_with_logits(labels, logits, axis=2)")
+        "tf.nn.softmax_cross_entropy_with_logits("
+        "labels=labels, logits=logits, axis=2)")
     _, unused_report, errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
 
     self.assertFalse(errors)
 
   def testSoftMaxCrossEntropyWithLogits(self):
-    text = "tf.nn.softmax_cross_entropy_with_logits(labels, logits, dim=2)"
+    text = ("tf.nn.softmax_cross_entropy_with_logits("
+            "labels=labels, logits=logits, dim=2)")
     expected_text = (
-        "tf.nn.softmax_cross_entropy_with_logits(labels, logits, dim=2)")
-    _, report, errors, new_text = self._upgrade(text)
+        "tf.nn.softmax_cross_entropy_with_logits("
+        "labels=tf.stop_gradient(labels), logits=logits, axis=2)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
-    self.assertIn(
-        "tf.nn.softmax_cross_entropy_with_logits requires manual check.",
-        errors[0])
-    self.assertIn(
-        "tf.nn.softmax_cross_entropy_with_logits behavior has changed. ",
-        report)
+
+    text = ("tf.nn.softmax_cross_entropy_with_logits("
+            "labels=foo(bar))")
+    expected_text = ("tf.nn.softmax_cross_entropy_with_logits("
+                     "labels=tf.stop_gradient(foo(bar)))")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
 
   def testSparseMatmul(self):
     text = ("tf.sparse_matmul(a, b, c, d, e, f, g)\n")
@@ -738,27 +849,68 @@ tf.print('abc')
 
   def testBatchGather(self):
     text = "tf.batch_gather(foo, bar)"
-    expected_text = "tf.gather(batch_dims=-1, params=foo, indices=bar)"
+    expected_text1 = "tf.gather(params=foo, indices=bar, batch_dims=-1)"
+    expected_text2 = "tf.gather(batch_dims=-1, params=foo, indices=bar)"
     _, unused_report, unused_errors, new_text = self._upgrade(text)
-    self.assertEqual(new_text, expected_text)
+    self.assertIn(new_text, [expected_text1, expected_text2])
 
     text = "tf.batch_gather(params=foo, indices=bar)"
-    expected_text = "tf.gather(batch_dims=-1, params=foo, indices=bar)"
-    _, unused_report, unused_errors, new_text = self._upgrade(text)
-    self.assertEqual(new_text, expected_text)
-
-    text = "tf.batch_gather  (  foo, bar)"
-    expected_text = "tf.gather  (batch_dims=-1,   params=foo, indices=bar)"
-    _, unused_report, unused_errors, new_text = self._upgrade(text)
-    self.assertEqual(new_text, expected_text)
-
-    text = "(tf.batch_gather\n(foo, bar))"
-    expected_text = "(tf.gather\n(params=foo, indices=bar))"
-    expected_errors = ["test.py:1: Unable to add keyword argument batch_dims=-1"
-                       " to tf.batch_gather; please add it manually."]
+    expected_text1 = "tf.gather(params=foo, indices=bar, batch_dims=-1)"
+    expected_text2 = "tf.gather(batch_dims=-1, params=foo, indices=bar)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertIn(new_text, [expected_text1, expected_text2])
+
+  def testCast(self):
+    for (name, dtype) in [("int32", "int32"),
+                          ("int64", "int64"),
+                          ("float", "float32"),
+                          ("double", "float64"),
+                          ("complex64", "complex64"),
+                          ("complex128", "complex128"),
+                          ("bfloat16", "bfloat16")]:
+      text = "tf.to_%s(x, name='test')" % name
+      expected_text = "tf.cast(x, name='test', dtype=tf.%s)" % dtype
+      _, unused_report, unused_errors, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+
+  def testCastPositionalSecondArgument(self):
+    for (name, dtype) in [("int32", "int32"),
+                          ("int64", "int64"),
+                          ("float", "float32"),
+                          ("double", "float64"),
+                          ("complex64", "complex64"),
+                          ("complex128", "complex128"),
+                          ("bfloat16", "bfloat16")]:
+      text = "tf.to_%s(x, 'test')" % name
+      expected_text = "tf.cast(x, name='test', dtype=tf.%s)" % dtype
+      _, unused_report, unused_errors, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+
+  def testImageResize(self):
+    for method in ["bilinear", "area", "bicubic", "nearest_neighbor"]:
+      text = "tf.image.resize_%s(i, s)" % method
+      expected_text = ("tf.image.resize(i, s, "
+                       "method=tf.image.ResizeMethod.%s)" % method.upper())
+      _, unused_report, unused_errors, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+
+  def testImageResizeExtraPositionalArgs(self):
+    for method in ["bilinear", "area", "bicubic", "nearest_neighbor"]:
+      text = "tf.image.resize_%s(i, s, a, p)" % method
+      expected_text = ["tf.image.resize(i, s, ", "align_corners=a, ",
+                       "preserve_aspect_ratio=p, ",
+                       "method=tf.image.ResizeMethod.%s)" % method.upper()]
+      _, unused_report, unused_errors, new_text = self._upgrade(text)
+      for s in expected_text:
+        self.assertIn(s, new_text)
+
+  def testCond(self):
+    text = "tf.cond(a, b, c, True)"
+    expected_text = "tf.cond(pred=a, true_fn=b, false_fn=c)"
     _, unused_report, errors, new_text = self._upgrade(text)
-    self.assertEqual(errors, expected_errors)
-    self.assertEqual(new_text, expected_text)
+    self.assertEqual(expected_text, new_text)
+    self.assertIn("tf.cond", errors[0])
+    self.assertIn("requires manual check", errors[0])
 
 
 class TestUpgradeFiles(test_util.TensorFlowTestCase):
@@ -778,4 +930,3 @@ class TestUpgradeFiles(test_util.TensorFlowTestCase):
 
 if __name__ == "__main__":
   test_lib.main()
-
diff --git a/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml_lib.py b/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml_lib.py
index c570d1a9f834bd9df57df62088a0c4562be9512c..038a712d538fbaeb8d0d176287704993cff07799 100644
--- a/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml_lib.py
+++ b/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml_lib.py
@@ -195,7 +195,7 @@ def generate_RSA(bits=2048, exponent=65537):
 
 def get_change_ssh_port(use_hostnet, port):
   if use_hostnet == 1:
-    return "sed -i '/Port 22/c\Port {}' /etc/ssh/sshd_config".format(port)
+    return r"sed -i '/Port 22/c\Port {}' /etc/ssh/sshd_config".format(port)
 
   return ''
 
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 1ad359ddccc71201553803140fa4efca06fbb5e1..e085ee7170c83729cb103811d5e2ba45e3d8cb96 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -15,8 +15,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         git \
         libcudnn7=7.2.1.38-1+cuda9.0 \
         libcudnn7-dev=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
-        libnccl-dev=2.2.13-1+cuda9.0 \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
@@ -41,11 +39,6 @@ RUN apt-get update && \
         apt-get install libnvinfer4=4.1.2-1+cuda9.0 && \
         apt-get install libnvinfer-dev=4.1.2-1+cuda9.0
 
-# Link NCCL libray and header where the build script expects them.
-RUN mkdir /usr/local/cuda-9.0/lib &&  \
-    ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
-    ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
-
 RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
     python get-pip.py && \
     rm get-pip.py
@@ -111,9 +104,6 @@ ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
 ENV TF_CUDA_VERSION=9.0
 ENV TF_CUDNN_VERSION=7
 
-# NCCL 2.x
-ENV TF_NCCL_VERSION=2
-
 RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
     LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
     tensorflow/tools/ci_build/builds/configured GPU \
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index 645d817d9f9d848b052445d3ada869e10810137e..c2449da9239df74eac5c6b1cd91df666e170a108 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -29,8 +29,8 @@ from __future__ import print_function
 import argparse
 import json
 import os
-import subprocess
 import shutil
+import subprocess
 
 
 def parse_branch_ref(filename):
@@ -174,8 +174,8 @@ def get_git_version(git_base_path, git_tag_override):
       # There might be "-" in the tag name. But we can be sure that the final
       # two "-" are those inserted by the git describe command.
       abbrev_commit = split_val[-1]
-      val = bytes(
-          version_separator.join([git_tag_override, "0", abbrev_commit]))
+      val = version_separator.join(
+          [bytes(git_tag_override, "utf-8"), b"0", abbrev_commit])
     return val if val else unknown_label
   except (subprocess.CalledProcessError, OSError):
     return unknown_label
diff --git a/tensorflow/tools/graph_transforms/BUILD b/tensorflow/tools/graph_transforms/BUILD
index eb1ed1f2ca859df42809084c1ea47a6f3b21012e..f229099e493d720d3658a06efd7aec9720de27d8 100644
--- a/tensorflow/tools/graph_transforms/BUILD
+++ b/tensorflow/tools/graph_transforms/BUILD
@@ -12,6 +12,7 @@ load(
     "tf_cc_binary",
     "tf_cc_test",
     "tf_py_test",
+    "if_not_v2",
 )
 
 exports_files(["LICENSE"])
@@ -131,12 +132,13 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
-        "//tensorflow/contrib/rnn:gru_ops_op_lib",
-        "//tensorflow/contrib/rnn:lstm_ops_op_lib",
         "//tensorflow/core/kernels:quantization_utils",
     ] + if_not_windows([
         "//tensorflow/core/kernels:remote_fused_graph_rewriter_transform",
         "//tensorflow/core/kernels/hexagon:hexagon_rewriter_transform",
+    ]) + if_not_v2([
+        "//tensorflow/contrib/rnn:gru_ops_op_lib",
+        "//tensorflow/contrib/rnn:lstm_ops_op_lib",
     ]),
     alwayslink = 1,
 )
diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
index 435f46c107cd9b0a6d64d4c0d52607ec5f41eb4f..6c7174926d06460556ce673a5fe738901134543d 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
@@ -291,7 +291,7 @@ class FoldOldBatchNormsTest : public ::testing::Test {
     std::vector<Tensor> fused_outputs;
     TF_ASSERT_OK(fused_session->Run({}, {"output"}, {}, &fused_outputs));
 
-    test::ExpectTensorNear<float>(original_outputs[0], fused_outputs[0], 1e-5);
+    test::ExpectClose(original_outputs[0], fused_outputs[0]);
 
     for (const NodeDef& node : fused_graph_def.node()) {
       EXPECT_NE("FusedBatchNorm", node.op());
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 1186189844aa887ba011b532df3a73d89ffe52b8..86bd5107924ec4627b955264b179a06231ef8532 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -162,6 +162,7 @@ genrule(
         "//conditions:default": [],
     }) + if_cuda([
         "@cub_archive//:LICENSE.TXT",
+        "@local_config_nccl//:LICENSE",
     ]) + if_mkl([
         "//third_party/mkl:LICENSE",
         "//third_party/mkl_dnn:LICENSE",
@@ -232,6 +233,7 @@ genrule(
         "//conditions:default": [],
     }) + if_cuda([
         "@cub_archive//:LICENSE.TXT",
+        "@local_config_nccl//:LICENSE",
     ]) + if_mkl([
         "//third_party/mkl:LICENSE",
         "//third_party/mkl_dnn:LICENSE",
diff --git a/tensorflow/tools/optimization/BUILD b/tensorflow/tools/optimization/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..aa6c850b0b3abb3351e3225e0c3a66ab4272846e
--- /dev/null
+++ b/tensorflow/tools/optimization/BUILD
@@ -0,0 +1,52 @@
+# Description:
+#   Utilities that perform useful transformations on graphs
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_binary",
+    "tf_cuda_library",
+)
+
+exports_files(["LICENSE"])
+
+tf_cuda_library(
+    name = "optimization_pass_runner_lib",
+    srcs = ["optimization_pass_runner.cc"],
+    hdrs = ["optimization_pass_runner.h"],
+    deps = [
+        "//tensorflow/contrib:contrib_ops_op_lib",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+    ],
+)
+
+tf_cc_binary(
+    name = "gpu_optimization_pass_runner",
+    srcs = ["gpu_optimization_pass_runner_main.cc"],
+    deps = [
+        ":optimization_pass_runner_lib",
+        "//tensorflow/compiler/jit:xla_cpu_jit",
+        "//tensorflow/compiler/jit:xla_gpu_jit",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/contrib:contrib_ops_op_lib",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+        "@com_google_absl//absl/strings",
+    ],
+)
diff --git a/tensorflow/tools/optimization/gpu_optimization_pass_runner_main.cc b/tensorflow/tools/optimization/gpu_optimization_pass_runner_main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0d9f26cd5a42f7315cc1d074e8b6ec19caa75f30
--- /dev/null
+++ b/tensorflow/tools/optimization/gpu_optimization_pass_runner_main.cc
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// This file creates a binary that can run any registered optimization pass.
+// ./xla_gpu_opt  --input_file_path=/tmp/input.pbtxt
+// --output_file_path=/tmp/output.pbtxt
+// --optimization_pass=NameOfGraphOptimizationPass
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/tools/optimization/optimization_pass_runner.h"
+
+int main(int argc, char** argv) {
+  tensorflow::OptimizationPassRunner runner;
+  // Add fake devices for CPU, GPU, and XLA to ensure we have all devices we
+  // need.
+  // Most machines in our servers currently use 8 gpus. There is nothing special
+  // about this number and it can be decreased or increased to test other
+  // configurations.
+  int num_gpus_per_machine = 8;
+  for (int i = 0; i < num_gpus_per_machine; i++) {
+    TF_CHECK_OK(runner.AddDevice(
+        absl::StrCat("/job:localhost/replica:0/task:0/device:CPU:", i),
+        tensorflow::DEVICE_CPU));
+    TF_CHECK_OK(runner.AddDevice(
+        absl::StrCat("/job:localhost/replica:0/task:0/device:GPU:", i),
+        tensorflow::DEVICE_GPU));
+    TF_CHECK_OK(runner.AddDevice(
+        absl::StrCat("/job:localhost/replica:0/task:0/device:XLA_CPU:", i),
+        tensorflow::DEVICE_XLA_CPU));
+    TF_CHECK_OK(runner.AddDevice(
+        absl::StrCat("/job:localhost/replica:0/task:0/device:XLA_GPU:", i),
+        tensorflow::DEVICE_XLA_GPU));
+    TF_CHECK_OK(runner.AddDevice(
+        absl::StrCat("/job:localhost/replica:0/task:0/device:CPU_XLA_JIT:", i),
+        tensorflow::DEVICE_CPU_XLA_JIT));
+    TF_CHECK_OK(runner.AddDevice(
+        absl::StrCat("/job:localhost/replica:0/task:0/device:GPU_XLA_JIT:", i),
+        tensorflow::DEVICE_GPU_XLA_JIT));
+  }
+  // This binary is used to test TF:XLA behavior, so turn on auto_jit.
+  TF_CHECK_OK(runner.SetJitLevel(tensorflow::OptimizerOptions::GlobalJitLevel::
+                                     OptimizerOptions_GlobalJitLevel_ON_2));
+  // Run the actual "main" function.
+  TF_CHECK_OK(runner.RunMain(argc, argv));
+}
diff --git a/tensorflow/tools/optimization/optimization_pass_runner.cc b/tensorflow/tools/optimization/optimization_pass_runner.cc
new file mode 100644
index 0000000000000000000000000000000000000000..231ff083813870819c23729e4308e0215661afcd
--- /dev/null
+++ b/tensorflow/tools/optimization/optimization_pass_runner.cc
@@ -0,0 +1,167 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// This file creates a library that can run any registered optimization pass.
+// The binary that uses this will be run in a form similar to:
+// ./optimization_pass_runner  --input_file_path=/tmp/input.pbtxt
+// --output_file_path=/tmp/output.pbtxt
+// --optimization_pass=NameOfGraphOptimizationPass
+#include "tensorflow/tools/optimization/optimization_pass_runner.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace tensorflow {
+
+namespace {
+// A fake device used to populate a DeviceSet.
+class FakeDevice : public Device {
+ private:
+  explicit FakeDevice(const DeviceAttributes& device_attributes)
+      : Device(nullptr, device_attributes) {}
+
+ public:
+  Status Sync() override;
+  static std::unique_ptr<Device> Make(const string& name, const string& type);
+};
+
+Status FakeDevice::Sync() {
+  return errors::Unimplemented("FakeDevice::Sync()");
+}
+
+std::unique_ptr<Device> FakeDevice::Make(const string& name,
+                                         const string& type) {
+  DeviceAttributes device_attributes;
+  device_attributes.set_name(name);
+  device_attributes.set_device_type(DeviceType(type).type());
+  return std::unique_ptr<Device>(new FakeDevice(device_attributes));
+}
+}  // namespace
+
+Status OptimizationPassRunner::RunMain(int argc, char** argv) {
+  string input_file_path;
+  string output_file_path;
+  string optimization_pass;
+
+  const std::vector<Flag> flag_list = {
+      Flag("input_file_path", &input_file_path, "Location of the input graph."),
+      Flag("output_file_path", &output_file_path,
+           "Location to write the resulting graph."),
+      // For now only a single optimization pass can be run.
+      Flag("optimization_pass", &optimization_pass,
+           "Which optimization pass to run."),
+  };
+  if (!Flags::Parse(&argc, argv, flag_list)) {
+    return errors::FailedPrecondition("Invalid flags passed");
+  }
+  port::InitMain(argv[0], &argc, &argv);
+
+  if (input_file_path.empty()) {
+    return errors::FailedPrecondition("input_file_path is a required flag.");
+  }
+  if (output_file_path.empty()) {
+    return errors::FailedPrecondition("output_file_path is a required flag.");
+  }
+  if (optimization_pass.empty()) {
+    return errors::FailedPrecondition("optimization_pass is a required flag.");
+  }
+
+  // Turn on XLA Auto-Jit.
+  auto session_options = absl::make_unique<SessionOptions>();
+  session_options->config.mutable_graph_options()
+      ->mutable_optimizer_options()
+      ->set_global_jit_level(jit_level_);
+  FunctionDefLibrary flib;
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+
+  GraphOptimizationPassOptions options;
+  options.session_options = session_options.release();
+  options.graph = &graph;
+  options.flib_def =
+      new FunctionLibraryDefinition((*options.graph)->op_registry(), flib);
+
+  // Grab the data
+  GraphDef graphdef;
+  GraphConstructorOptions graph_opts;
+  graph_opts.expect_device_spec = true;
+  graph_opts.allow_internal_ops = true;
+  TF_RETURN_IF_ERROR(ReadTextProto(Env::Default(), input_file_path, &graphdef));
+  TF_RETURN_IF_ERROR(
+      ConvertGraphDefToGraph(graph_opts, graphdef, options.graph->get()));
+
+  // Add all devices that were previously configured with AddDevice.
+  DeviceSet device_set;
+  for (auto& device : devices_) {
+    device_set.AddDevice(device.get());
+  }
+  options.device_set = &device_set;
+
+  Status result = errors::NotFound(
+      "An OptimizationPass was not found with the desired name.");
+
+  // Run the optimization pass specified by the command line flag.
+  for (const auto& groups_and_passes :
+       OptimizationPassRegistry::Global()->groups()) {
+    for (const auto& phase_and_passes : groups_and_passes.second) {
+      for (const auto& pass : phase_and_passes.second) {
+        if (pass->name() == optimization_pass) {
+          result = pass->Run(options);
+        }
+      }
+    }
+  }
+
+  TF_RETURN_IF_ERROR(result);
+
+  // Write out the result.
+  options.graph->get()->ToGraphDef(&graphdef);
+  TF_RETURN_IF_ERROR(
+      WriteTextProto(Env::Default(), output_file_path, graphdef));
+  return Status::OK();
+}
+
+Status OptimizationPassRunner::SetJitLevel(
+    OptimizerOptions::GlobalJitLevel jit_level) {
+  jit_level_ = jit_level;
+  return Status::OK();
+}
+
+Status OptimizationPassRunner::AddDevice(const string& name,
+                                         const string& type) {
+  devices_.push_back(FakeDevice::Make(name, type));
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/tools/optimization/optimization_pass_runner.h b/tensorflow/tools/optimization/optimization_pass_runner.h
new file mode 100644
index 0000000000000000000000000000000000000000..3b26f64bcfb86e5e7fd6b6fe31b20cf75f931da1
--- /dev/null
+++ b/tensorflow/tools/optimization/optimization_pass_runner.h
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_TOOLS_OPTIMIZATION_OPTIMIZATION_PASS_RUNNER_H_
+#define TENSORFLOW_TOOLS_OPTIMIZATION_OPTIMIZATION_PASS_RUNNER_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+
+// OptimizationPassRunner can be initialized, populated with devices, then run
+// to test individual Tensorflow Optimization passes.
+class OptimizationPassRunner {
+ public:
+  explicit OptimizationPassRunner()
+      : jit_level_(OptimizerOptions::GlobalJitLevel::
+                       OptimizerOptions_GlobalJitLevel_DEFAULT) {}
+
+  // Add a fake device to the (initially empty) DeviceSet used for optimization.
+  // Names are of the form: "/job:localhost/replica:0/task:0/device:CPU:0"
+  Status AddDevice(const string& name, const string& type);
+
+  // Increasing the Jit level will cause XLA to compile parts of the tensorflow
+  // graph that it is able to.
+  Status SetJitLevel(OptimizerOptions::GlobalJitLevel jit_level);
+
+  // This can be called after adding devices and setting the jit level to parse
+  // command line flags and run the specified job. All 3 flags are required:
+  // input_file_path, output_file_path, optimization_pass.
+  //
+  // If this library becomes heavily used, the caller should be responsible for
+  // parsing any command line flags desired rather than this Method handling the
+  // work of a main() function.
+  Status RunMain(int argc, char** argv);
+
+ private:
+  OptimizerOptions::GlobalJitLevel jit_level_;
+  std::vector<std::unique_ptr<Device>> devices_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_TOOLS_OPTIMIZATION_OPTIMIZATION_PASS_RUNNER_H_
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 93a85763f51d3fac5607bd8677f835aaa73e99cb..c51b45a49c4010229bc8a7c20958b57c23139e6a 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -169,6 +169,7 @@ filegroup(
         "@local_config_sycl//sycl:LICENSE.text",
         "@nasm//:LICENSE",
         "@nsync//:LICENSE",
+        "@pasta//:LICENSE",
         "@pcre//:LICENCE",
         "@png_archive//:LICENSE",
         "@protobuf_archive//:LICENSE",
diff --git a/tensorflow/tools/pip_package/MANIFEST.in b/tensorflow/tools/pip_package/MANIFEST.in
index 272ff4735c34b319589bd9302fcdb5cd91b6d1ec..c304e8cf6ebe1739c1cc9011dafd8f89cae9baac 100644
--- a/tensorflow/tools/pip_package/MANIFEST.in
+++ b/tensorflow/tools/pip_package/MANIFEST.in
@@ -6,7 +6,6 @@ recursive-include * *.so
 recursive-include * *.dll
 recursive-include * *.lib
 recursive-include * *.csv
-recursive-include tensorflow/aux-bin *
 recursive-include tensorflow/include/tensorflow *.h
 recursive-include tensorflow/include/Eigen *
 recursive-include tensorflow/include/external *
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index 439b5428b3b7bff651689e08e783bf7875f16319..27815491d23a6ec294f08b1b5eee5ed2d11e9766 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -118,9 +118,6 @@ function prepare_src() {
         fi
       fi
     fi
-    mkdir "${TMPDIR}/tensorflow/aux-bin"
-    # Install toco as a binary in aux-bin.
-    cp bazel-bin/tensorflow/lite/python/tflite_convert ${TMPDIR}/tensorflow/aux-bin/
   fi
 
   # protobuf pip package doesn't ship with header files. Copy the headers
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index ff821b864300c1eeb2f9d290ae47a25ce87a0884..952c71c61580fba72dbf1a4b2e1bd836816b1420 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -30,14 +30,19 @@ os.chdir(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")))
 PIP_PACKAGE_QUERY_EXPRESSION = (
     "deps(//tensorflow/tools/pip_package:build_pip_package)")
 
+# List of file paths containing BUILD files that should not be included for the
+# pip smoke test.
+BUILD_BLACKLIST = [
+    "tensorflow/lite/examples/android",
+    "tensorflow/lite/experimental/swift",
+]
 
 def GetBuild(dir_base):
   """Get the list of BUILD file all targets recursively startind at dir_base."""
   items = []
   for root, _, files in os.walk(dir_base):
     for name in files:
-      if (name == "BUILD" and
-          root.find("tensorflow/lite/examples/android") == -1):
+      if (name == "BUILD" and root not in BUILD_BLACKLIST):
         items.append("//" + root + ":all")
   return items
 
@@ -67,9 +72,9 @@ def BuildPyTestDependencies():
 
 PYTHON_TARGETS, PY_TEST_QUERY_EXPRESSION = BuildPyTestDependencies()
 
-# Hard-coded blacklist of files if not included in pip package
 # TODO(amitpatankar): Clean up blacklist.
-BLACKLIST = [
+# List of dependencies that should not included in the pip package.
+DEPENDENCY_BLACKLIST = [
     "//tensorflow/python:extra_py_tests_deps",
     "//tensorflow/cc/saved_model:saved_model_half_plus_two",
     "//tensorflow:no_tensorflow_py_deps",
@@ -82,9 +87,7 @@ BLACKLIST = [
     "//tensorflow/core/kernels/cloud:bigquery_reader_ops",
     "//tensorflow/python/feature_column:vocabulary_testdata",
     "//tensorflow/python:framework/test_file_system.so",
-    # contrib
-    "//tensorflow/contrib/session_bundle:session_bundle_half_plus_two",
-    "//tensorflow/contrib/keras:testing_utils",
+    # lite
     "//tensorflow/lite/experimental/examples/lstm:tflite_lstm",
     "//tensorflow/lite/experimental/examples/lstm:tflite_lstm.py",
     "//tensorflow/lite/experimental/examples/lstm:unidirectional_sequence_lstm_test",  # pylint:disable=line-too-long
@@ -93,6 +96,9 @@ BLACKLIST = [
     "//tensorflow/lite/python:interpreter_test",
     "//tensorflow/lite/python:interpreter.py",
     "//tensorflow/lite/python:interpreter_test.py",
+    # contrib
+    "//tensorflow/contrib/session_bundle:session_bundle_half_plus_two",
+    "//tensorflow/contrib/keras:testing_utils",
     "//tensorflow/contrib/ffmpeg:test_data",
     "//tensorflow/contrib/fused_conv:fused_conv2d_bias_activation_op_test_base",
     "//tensorflow/contrib/hadoop:test_data",
@@ -102,6 +108,7 @@ BLACKLIST = [
     "//tensorflow/contrib/framework:checkpoint_ops_testdata",
     "//tensorflow/contrib/bayesflow:reinforce_simple_example",
     "//tensorflow/contrib/bayesflow:examples/reinforce_simple/reinforce_simple_example.py",  # pylint:disable=line-too-long
+    "//tensorflow/contrib/saved_model:reader",  # Not present in v2
     "//tensorflow/contrib/timeseries/examples:predict",
     "//tensorflow/contrib/timeseries/examples:multivariate",
     "//tensorflow/contrib/timeseries/examples:known_anomaly",
@@ -148,8 +155,8 @@ def main():
   # File extensions and endings to ignore
   ignore_extensions = ["_test", "_test.py", "_test_gpu", "_test_gpu.py"]
 
-  ignored_files = 0
-  blacklisted_files = len(BLACKLIST)
+  ignored_files_count = 0
+  blacklisted_dependencies_count = len(DEPENDENCY_BLACKLIST)
   # Compare dependencies
   for dependency in tf_py_test_dependencies_list:
     if dependency and dependency.startswith("//tensorflow"):
@@ -157,16 +164,16 @@ def main():
       # Ignore extensions
       if any(dependency.endswith(ext) for ext in ignore_extensions):
         ignore = True
-        ignored_files += 1
+        ignored_files_count += 1
 
-      # Check if the dependency is in the pip package, the blacklist, or
-      # should be ignored because of its file extension
+      # Check if the dependency is in the pip package, the dependency blacklist,
+      # or should be ignored because of its file extension.
       if not (ignore or dependency in pip_package_dependencies_list or
-              dependency in BLACKLIST):
+              dependency in DEPENDENCY_BLACKLIST):
         missing_dependencies.append(dependency)
 
-  print("Ignored files: %d" % ignored_files)
-  print("Blacklisted files: %d" % blacklisted_files)
+  print("Ignored files count: %d" % ignored_files_count)
+  print("Blacklisted dependencies count: %d" % blacklisted_dependencies_count)
   if missing_dependencies:
     print("Missing the following dependencies from pip_packages:")
     for missing_dependency in missing_dependencies:
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 3927540cc79ef8b827ce4d7e60e884c2237f8e9d..55b7046e309f0b14e10a7978533c947aea5c5230 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -51,6 +51,7 @@ REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
     'astor >= 0.6.0',
     'gast >= 0.2.0',
+    'google_pasta >= 0.1.0',
     'keras_applications >= 1.0.6',
     'keras_preprocessing >= 1.0.5',
     'numpy >= 1.13.3',
@@ -248,7 +249,7 @@ setup(
     url='https://www.tensorflow.org/',
     download_url='https://github.com/tensorflow/tensorflow/tags',
     author='Google Inc.',
-    author_email='opensource@google.com',
+    author_email='packages@tensorflow.org',
     # Contained modules and scripts.
     packages=find_packages(),
     entry_points={
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 157cb528063c7a8158c7f331fde22dc62122e19c..a7f3665a31c15cf50f0314ef061f62186db64557 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -29,6 +29,7 @@ load("//third_party/jpeg:workspace.bzl", jpeg = "repo")
 load("//third_party/nasm:workspace.bzl", nasm = "repo")
 load("//third_party/kissfft:workspace.bzl", kissfft = "repo")
 load("//third_party/keras_applications_archive:workspace.bzl", keras_applications = "repo")
+load("//third_party/pasta:workspace.bzl", pasta = "repo")
 
 def initialize_third_party():
     """ Load third party repositories.  See above load() statements. """
@@ -41,6 +42,7 @@ def initialize_third_party():
     kissfft()
     jpeg()
     nasm()
+    pasta()
 
 # Sanitize a dependency so that it works correctly from code that includes
 # TensorFlow as a submodule.
@@ -136,11 +138,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "eigen_archive",
         build_file = clean_dep("//third_party:eigen.BUILD"),
-        sha256 = "753fbb58d0a49b6bcbcfb126ebfa2e21fc97f7471529ba835a096008ce588d8a",
-        strip_prefix = "eigen-eigen-9f48e814419e",
+        sha256 = "9de38f2d162c51599b802f7c36d9f3773980d19ac908c61638f8344d2c10e1ca",
+        strip_prefix = "eigen-eigen-88fc23324517",
         urls = [
-            "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/9f48e814419e.tar.gz",
-            "https://bitbucket.org/eigen/eigen/get/9f48e814419e.tar.gz",
+            "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/88fc23324517..tar.gz",
+            "https://bitbucket.org/eigen/eigen/get/88fc23324517.tar.gz",
         ],
     )
 
@@ -283,7 +285,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         system_build_file = clean_dep("//third_party/systemlibs:astor.BUILD"),
         urls = [
             "https://mirror.bazel.build/pypi.python.org/packages/99/80/f9482277c919d28bebd85813c0a70117214149a96b08981b72b63240b84c/astor-0.7.1.tar.gz",
-            "https://files.pythonhosted.org/packages/99/80/f9482277c919d28bebd85813c0a70117214149a96b08981b72b63240b84c/astor-0.7.1.tar.gz",
+            "https://pypi.python.org/packages/99/80/f9482277c919d28bebd85813c0a70117214149a96b08981b72b63240b84c/astor-0.7.1.tar.gz",
         ],
     )
 
@@ -393,22 +395,22 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
     tf_http_archive(
         name = "nsync",
-        sha256 = "692f9b30e219f71a6371b98edd39cef3cbda35ac3abc4cd99ce19db430a5591a",
-        strip_prefix = "nsync-1.20.1",
+        sha256 = "704be7f58afa47b99476bbac7aafd1a9db4357cef519db361716f13538547ffd",
+        strip_prefix = "nsync-1.20.2",
         system_build_file = clean_dep("//third_party/systemlibs:nsync.BUILD"),
         urls = [
-            "https://mirror.bazel.build/github.com/google/nsync/archive/1.20.1.tar.gz",
-            "https://github.com/google/nsync/archive/1.20.1.tar.gz",
+            "https://mirror.bazel.build/github.com/google/nsync/archive/1.20.2.tar.gz",
+            "https://github.com/google/nsync/archive/1.20.2.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "com_google_googletest",
-        sha256 = "353ab86e35cea1cd386115279cf4b16695bbf21b897bfbf2721cf4cb5f64ade8",
-        strip_prefix = "googletest-997d343dd680e541ef96ce71ee54a91daf2577a0",
+        sha256 = "61eee610f136c1edc693d979647a4bb2ca253d60e6964724b61af85d32a41251",
+        strip_prefix = "googletest-6729a1361150131bc5d394d5cd2b4cdf0953ee7b",
         urls = [
-            "https://mirror.bazel.build/github.com/google/googletest/archive/997d343dd680e541ef96ce71ee54a91daf2577a0.zip",
-            "https://github.com/google/googletest/archive/997d343dd680e541ef96ce71ee54a91daf2577a0.zip",
+            "https://mirror.bazel.build/github.com/google/googletest/archive/6729a1361150131bc5d394d5cd2b4cdf0953ee7b.zip",
+            "https://github.com/google/googletest/archive/6729a1361150131bc5d394d5cd2b4cdf0953ee7b.zip",
         ],
     )
 
@@ -498,11 +500,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "llvm",
         build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"),
-        sha256 = "65a1aeb29e5940f9f480a41e904659d944e738458afd139caa7bde14bd6aab8a",
-        strip_prefix = "llvm-331ffd31b3dd49b3f02a27556938b836b679f564",
+        sha256 = "83a4f199742f3d6892994dd6dc46d6a53019aedaa28590b460c120f3dfc7bc47",
+        strip_prefix = "llvm-671f057a2cf137914be6c786daf8000469adebab",
         urls = [
-            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/331ffd31b3dd49b3f02a27556938b836b679f564.tar.gz",
-            "https://github.com/llvm-mirror/llvm/archive/331ffd31b3dd49b3f02a27556938b836b679f564.tar.gz",
+            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/671f057a2cf137914be6c786daf8000469adebab.tar.gz",
+            "https://github.com/llvm-mirror/llvm/archive/671f057a2cf137914be6c786daf8000469adebab.tar.gz",
         ],
     )
 
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 8aa5b89cddb336380d35f85a6ecd3ebdf6589e88..8de4fe58e57dcce8f96553c21dd9250a22f43b11 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -400,7 +400,7 @@ def _cuda_include_path(repository_ctx, cuda_config):
   return "\n".join(inc_entries)
 
 
-def _enable_cuda(repository_ctx):
+def enable_cuda(repository_ctx):
   if "TF_NEED_CUDA" in repository_ctx.os.environ:
     enable_cuda = repository_ctx.os.environ["TF_NEED_CUDA"].strip()
     return enable_cuda == "1"
@@ -1545,7 +1545,7 @@ def _create_remote_cuda_repository(repository_ctx, remote_config_repo):
 
 def _cuda_autoconf_impl(repository_ctx):
   """Implementation of the cuda_autoconf repository rule."""
-  if not _enable_cuda(repository_ctx):
+  if not enable_cuda(repository_ctx):
     _create_dummy_repository(repository_ctx)
   elif _TF_CUDA_CONFIG_REPO in repository_ctx.os.environ:
     _create_remote_cuda_repository(
diff --git a/third_party/nccl/archive.BUILD b/third_party/nccl/archive.BUILD
index 211b794bb0673d433273afb15d7966e0b051a37f..5901c6b296fa0f4da8061b2b44daed18cd0b3558 100644
--- a/third_party/nccl/archive.BUILD
+++ b/third_party/nccl/archive.BUILD
@@ -96,7 +96,7 @@ cc_library(
     ],
     hdrs = ["nccl.h"],
     copts = cuda_default_copts() + ["-Wno-vla"],
-    include_prefix = "third_party/nccl/",
+    include_prefix = "third_party/nccl",
     visibility = ["//visibility:public"],
     deps = [
         ":device",
diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl
index 1e6422b49ef4d7ce97b3b38f3b3964281a158b7c..f7d7d553838217de33b40df67bada95778b78f55 100644
--- a/third_party/nccl/nccl_configure.bzl
+++ b/third_party/nccl/nccl_configure.bzl
@@ -13,7 +13,9 @@ load(
     "auto_configure_fail",
     "compute_capabilities",
     "cuda_toolkit_path",
+    "enable_cuda",
     "find_cuda_define",
+    "get_cpu_value",
     "matches_version",
 )
 
@@ -22,7 +24,7 @@ _NCCL_HDR_PATH = "NCCL_HDR_PATH"
 _NCCL_INSTALL_PATH = "NCCL_INSTALL_PATH"
 _TF_CUDA_COMPUTE_CAPABILITIES = "TF_CUDA_COMPUTE_CAPABILITIES"
 _TF_NCCL_VERSION = "TF_NCCL_VERSION"
-_TF_NCCL_CONFIG_REPO = "TF_NCCL_CONFIG_REPO"
+_TF_NEED_CUDA = "TF_NEED_CUDA"
 
 _DEFINE_NCCL_MAJOR = "#define NCCL_MAJOR"
 _DEFINE_NCCL_MINOR = "#define NCCL_MINOR"
@@ -116,26 +118,23 @@ def _check_nccl_version(repository_ctx, nccl_install_path, nccl_hdr_path, nccl_v
     header_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
     if not matches_version(nccl_version, header_version):
         auto_configure_fail(
-            ("NCCL library version detected from %s/nccl.h (%s) does not match " +
-             "TF_NCCL_VERSION (%s). To fix this rerun configure again.") %
+            ("NCCL library version detected from %s/nccl.h (%s) does not " +
+             "match TF_NCCL_VERSION (%s). To fix this rerun configure again.") %
             (header_dir, header_version, nccl_version),
         )
 
 def _nccl_configure_impl(repository_ctx):
     """Implementation of the nccl_configure repository rule."""
-    if _TF_NCCL_VERSION not in repository_ctx.os.environ:
+    if not enable_cuda(repository_ctx) or \
+       get_cpu_value(repository_ctx) not in ("Linux", "FreeBSD"):
         # Add a dummy build file to make bazel query happy.
         repository_ctx.file("BUILD", _NCCL_DUMMY_BUILD_CONTENT)
         return
 
-    if _TF_NCCL_CONFIG_REPO in repository_ctx.os.environ:
-        # Forward to the pre-configured remote repository.
-        repository_ctx.template("BUILD", _label("remote.BUILD.tpl"), {
-            "%{target}": repository_ctx.os.environ[_TF_NCCL_CONFIG_REPO],
-        })
-        return
+    nccl_version = ""
+    if _TF_NCCL_VERSION in repository_ctx.os.environ:
+        nccl_version = repository_ctx.os.environ[_TF_NCCL_VERSION].strip()
 
-    nccl_version = repository_ctx.os.environ[_TF_NCCL_VERSION].strip()
     if nccl_version == "":
         # Alias to open source build from @nccl_archive.
         repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT)
@@ -179,7 +178,7 @@ nccl_configure = repository_rule(
         _NCCL_INSTALL_PATH,
         _TF_NCCL_VERSION,
         _TF_CUDA_COMPUTE_CAPABILITIES,
-        _TF_NCCL_CONFIG_REPO,
+        _TF_NEED_CUDA,
     ],
 )
 """Detects and configures the NCCL configuration.
diff --git a/third_party/nccl/remote.BUILD.tpl b/third_party/nccl/remote.BUILD.tpl
deleted file mode 100644
index d66fc5563d16edc81c9d883984e438f82e6820ae..0000000000000000000000000000000000000000
--- a/third_party/nccl/remote.BUILD.tpl
+++ /dev/null
@@ -1,6 +0,0 @@
-licenses(["restricted"])
-
-package(default_visibility = ["//visibility:public"])
-
-alias(name="LICENSE", actual = "%{target}:LICENSE")
-alias(name = "nccl", actual = "%{target}:nccl")
diff --git a/third_party/pasta/BUILD b/third_party/pasta/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..9bd256a57939c402a1f2240f2ddc53f97794c56b
--- /dev/null
+++ b/third_party/pasta/BUILD
@@ -0,0 +1 @@
+# Empty BUILD file to force build system to see this directory at all.
diff --git a/third_party/pasta/BUILD.bazel b/third_party/pasta/BUILD.bazel
new file mode 100644
index 0000000000000000000000000000000000000000..ade681b606953b1df3e0140f83d714a39384c221
--- /dev/null
+++ b/third_party/pasta/BUILD.bazel
@@ -0,0 +1,30 @@
+# Description:
+#   AST-based python refactoring.
+load("@//third_party/pasta:build_defs.bzl", "copy_srcs")
+
+licenses(["notice"])  # Apache2
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "pasta",
+    srcs = copy_srcs([
+        "__init__.py",
+        "augment/__init__.py",
+        "augment/errors.py",
+        "augment/import_utils.py",
+        "augment/inline.py",
+        "augment/rename.py",
+        "base/__init__.py",
+        "base/annotate.py",
+        "base/ast_constants.py",
+        "base/ast_utils.py",
+        "base/codegen.py",
+        "base/formatting.py",
+        "base/scope.py",
+        "base/test_utils.py",
+        "base/token_generator.py",
+    ]),
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/pasta/BUILD.system b/third_party/pasta/BUILD.system
new file mode 100644
index 0000000000000000000000000000000000000000..6adc953c5abdc4bc5495fdf1bceef242a7bac61a
--- /dev/null
+++ b/third_party/pasta/BUILD.system
@@ -0,0 +1,13 @@
+# Description: Pasta, AST based python refactoring.
+
+licenses(["notice"])  # Apache2
+
+filegroup(
+    name = "LICENSE",
+    visibility = ["//visibility:public"],
+)
+
+py_library(
+    name = "pasta",
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/pasta/build_defs.bzl b/third_party/pasta/build_defs.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..0a5316de402b8cb6d59ba271400bf4d9bee9f033
--- /dev/null
+++ b/third_party/pasta/build_defs.bzl
@@ -0,0 +1,12 @@
+"""Skylark makros for building pasta."""
+
+def copy_srcs(srcs):
+    """Copies srcs from 'pasta' to parent directory."""
+    for src in srcs:
+        native.genrule(
+            name = src.replace(".", "_"),
+            srcs = ["pasta/" + src],
+            outs = [src],
+            cmd = "mkdir -p $$(dirname $@); cp $< $@",
+        )
+    return srcs
diff --git a/third_party/pasta/workspace.bzl b/third_party/pasta/workspace.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..e46cc4a45e42cc8e9da0e8c8401f05673286686d
--- /dev/null
+++ b/third_party/pasta/workspace.bzl
@@ -0,0 +1,16 @@
+"""Loads pasta python package."""
+
+load("//third_party:repo.bzl", "third_party_http_archive")
+
+def repo():
+    third_party_http_archive(
+        name = "pasta",
+        urls = [
+            "https://mirror.bazel.build/github.com/google/pasta/archive/c3d72cdee6fc806251949e912510444d58d7413c.tar.gz",
+            "https://github.com/google/pasta/archive/c3d72cdee6fc806251949e912510444d58d7413c.tar.gz",
+        ],
+        strip_prefix = "pasta-c3d72cdee6fc806251949e912510444d58d7413c",
+        sha256 = "b5905f9cecc4b28363c563f3c4cb0545288bd35f7cc72c55066e97e53befc084",
+        build_file = "//third_party/pasta:BUILD.bazel",
+        system_build_file = "//third_party/pasta:BUILD.system",
+    )
diff --git a/third_party/toolchains/BUILD b/third_party/toolchains/BUILD
index 9da417fd5fe18619de6dc51032b8e3cde21b6ffb..6ed6e5c3679e6e245a63e8c906cf7be51c8d4f5b 100644
--- a/third_party/toolchains/BUILD
+++ b/third_party/toolchains/BUILD
@@ -4,10 +4,9 @@ package(default_visibility = ["//visibility:public"])
 
 load("//third_party/toolchains/preconfig/generate:containers.bzl", "container_digests")
 
-# Platform for use with remote execution with
-# custom container based off RBE Ubuntu16_04
-# http://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04
-# Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cpu
+# TODO(b/122347293): This is the RBE config based on the CPU configuration / image provided
+# in the asci-toolchain setup. Delete this once we switched CPU remote builds to the
+# new platform below.
 platform(
     name = "rbe_ubuntu16_04-tf",
     constraint_values = [
@@ -23,6 +22,26 @@ platform(
         }""",
 )
 
+# Remote build platforms.
+# Each of the platform rules here provide a platform definition that is bound to a docker image.
+# The result of the skylark configuration is checked into
+# //tensorflow/third_party/toolchains/preconfig.
+
+# Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cpu.
+platform(
+    name = "rbe_ubuntu16.04",
+    constraint_values = [
+        "@bazel_tools//platforms:x86_64",
+        "@bazel_tools//platforms:linux",
+    ],
+    remote_execution_properties = """
+        properties: {
+            name: "container-image"
+            value:"docker://gcr.io/tensorflow-testing/nosla-ubuntu16.04@%s"
+        }""" % container_digests["ubuntu16.04"],
+)
+
+# Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04.
 platform(
     name = "rbe_cuda9.0-cudnn7-ubuntu14.04",
     constraint_values = [
@@ -32,10 +51,11 @@ platform(
     remote_execution_properties = """
         properties: {
             name: "container-image"
-            value:"docker://gcr.io/asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04@%s"
+            value:"docker://gcr.io/tensorflow-testing/nosla-cuda9.0-cudnn7-ubuntu14.04@%s"
         }""" % container_digests["cuda9.0-cudnn7-ubuntu14.04"],
 )
 
+# Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04.
 platform(
     name = "rbe_cuda10.0-cudnn7-ubuntu14.04",
     constraint_values = [
@@ -45,6 +65,6 @@ platform(
     remote_execution_properties = """
         properties: {
             name: "container-image"
-            value:"docker://gcr.io/asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04@%s"
+            value:"docker://gcr.io/tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu14.04@%s"
         }""" % container_digests["cuda10.0-cudnn7-ubuntu14.04"],
 )
diff --git a/third_party/toolchains/gpus/cuda/BUILD b/third_party/toolchains/gpus/cuda/BUILD
index f63a0ea81925783085b1b551aab778d41ba1fb2c..8bb22c0269b5c4bfc21ea60c6605ac75ba072595 100644
--- a/third_party/toolchains/gpus/cuda/BUILD
+++ b/third_party/toolchains/gpus/cuda/BUILD
@@ -85,8 +85,8 @@ cc_library(
 
 cc_library(
     name = "cudart",
-    srcs = ["cuda/lib/libcudart.so.9.0"],
-    data = ["cuda/lib/libcudart.so.9.0"],
+    srcs = ["cuda/lib/libcudart.so.10.0"],
+    data = ["cuda/lib/libcudart.so.10.0"],
     includes = [
         ".",
         "cuda/include",
@@ -97,8 +97,8 @@ cc_library(
 
 cc_library(
     name = "cublas",
-    srcs = ["cuda/lib/libcublas.so.9.0"],
-    data = ["cuda/lib/libcublas.so.9.0"],
+    srcs = ["cuda/lib/libcublas.so.10.0"],
+    data = ["cuda/lib/libcublas.so.10.0"],
     includes = [
         ".",
         "cuda/include",
@@ -109,8 +109,8 @@ cc_library(
 
 cc_library(
     name = "cusolver",
-    srcs = ["cuda/lib/libcusolver.so.9.0"],
-    data = ["cuda/lib/libcusolver.so.9.0"],
+    srcs = ["cuda/lib/libcusolver.so.10.0"],
+    data = ["cuda/lib/libcusolver.so.10.0"],
     includes = [
         ".",
         "cuda/include",
@@ -143,8 +143,8 @@ cc_library(
 
 cc_library(
     name = "cufft",
-    srcs = ["cuda/lib/libcufft.so.9.0"],
-    data = ["cuda/lib/libcufft.so.9.0"],
+    srcs = ["cuda/lib/libcufft.so.10.0"],
+    data = ["cuda/lib/libcufft.so.10.0"],
     includes = [
         ".",
         "cuda/include",
@@ -155,8 +155,8 @@ cc_library(
 
 cc_library(
     name = "curand",
-    srcs = ["cuda/lib/libcurand.so.9.0"],
-    data = ["cuda/lib/libcurand.so.9.0"],
+    srcs = ["cuda/lib/libcurand.so.10.0"],
+    data = ["cuda/lib/libcurand.so.10.0"],
     includes = [
         ".",
         "cuda/include",
@@ -193,7 +193,7 @@ cc_library(
 
 cc_library(
     name = "cupti_dsos",
-    data = ["cuda/lib/libcupti.so.9.0"],
+    data = ["cuda/lib/libcupti.so.10.0"],
     includes = [
         ".",
         "cuda/include",
@@ -1193,7 +1193,7 @@ genrule(
         "cuda/include/vector_types.h",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/include/CL/cl.h" "$(@D)/cuda/include/CL/cl.h" && cp "/usr/local/cuda-9.0/include/CL/cl.hpp" "$(@D)/cuda/include/CL/cl.hpp" && cp "/usr/local/cuda-9.0/include/CL/cl_egl.h" "$(@D)/cuda/include/CL/cl_egl.h" && cp "/usr/local/cuda-9.0/include/CL/cl_ext.h" "$(@D)/cuda/include/CL/cl_ext.h" && cp "/usr/local/cuda-9.0/include/CL/cl_gl.h" "$(@D)/cuda/include/CL/cl_gl.h" && cp "/usr/local/cuda-9.0/include/CL/cl_gl_ext.h" "$(@D)/cuda/include/CL/cl_gl_ext.h" && cp "/usr/local/cuda-9.0/include/CL/cl_platform.h" "$(@D)/cuda/include/CL/cl_platform.h" && cp "/usr/local/cuda-9.0/include/CL/opencl.h" "$(@D)/cuda/include/CL/opencl.h" && cp "/usr/local/cuda-9.0/include/builtin_types.h" "$(@D)/cuda/include/builtin_types.h" && cp "/usr/local/cuda-9.0/include/channel_descriptor.h" "$(@D)/cuda/include/channel_descriptor.h" && cp "/usr/local/cuda-9.0/include/common_functions.h" "$(@D)/cuda/include/common_functions.h" && cp "/usr/local/cuda-9.0/include/cooperative_groups.h" "$(@D)/cuda/include/cooperative_groups.h" && cp "/usr/local/cuda-9.0/include/cooperative_groups_helpers.h" "$(@D)/cuda/include/cooperative_groups_helpers.h" && cp "/usr/local/cuda-9.0/include/crt/common_functions.h" "$(@D)/cuda/include/crt/common_functions.h" && cp "/usr/local/cuda-9.0/include/crt/device_double_functions.h" "$(@D)/cuda/include/crt/device_double_functions.h" && cp "/usr/local/cuda-9.0/include/crt/device_double_functions.hpp" "$(@D)/cuda/include/crt/device_double_functions.hpp" && cp "/usr/local/cuda-9.0/include/crt/device_functions.h" "$(@D)/cuda/include/crt/device_functions.h" && cp "/usr/local/cuda-9.0/include/crt/device_functions.hpp" "$(@D)/cuda/include/crt/device_functions.hpp" && cp "/usr/local/cuda-9.0/include/crt/func_macro.h" "$(@D)/cuda/include/crt/func_macro.h" && cp "/usr/local/cuda-9.0/include/crt/host_config.h" "$(@D)/cuda/include/crt/host_config.h" && cp "/usr/local/cuda-9.0/include/crt/host_defines.h" "$(@D)/cuda/include/crt/host_defines.h" && cp "/usr/local/cuda-9.0/include/crt/host_runtime.h" "$(@D)/cuda/include/crt/host_runtime.h" && cp "/usr/local/cuda-9.0/include/crt/math_functions.h" "$(@D)/cuda/include/crt/math_functions.h" && cp "/usr/local/cuda-9.0/include/crt/math_functions.hpp" "$(@D)/cuda/include/crt/math_functions.hpp" && cp "/usr/local/cuda-9.0/include/crt/mma.h" "$(@D)/cuda/include/crt/mma.h" && cp "/usr/local/cuda-9.0/include/crt/mma.hpp" "$(@D)/cuda/include/crt/mma.hpp" && cp "/usr/local/cuda-9.0/include/crt/nvfunctional" "$(@D)/cuda/include/crt/nvfunctional" && cp "/usr/local/cuda-9.0/include/crt/sm_70_rt.h" "$(@D)/cuda/include/crt/sm_70_rt.h" && cp "/usr/local/cuda-9.0/include/crt/sm_70_rt.hpp" "$(@D)/cuda/include/crt/sm_70_rt.hpp" && cp "/usr/local/cuda-9.0/include/crt/storage_class.h" "$(@D)/cuda/include/crt/storage_class.h" && cp "/usr/local/cuda-9.0/include/cuComplex.h" "$(@D)/cuda/include/cuComplex.h" && cp "/usr/local/cuda-9.0/include/cublas.h" "$(@D)/cuda/include/cublas.h" && cp "/usr/local/cuda-9.0/include/cublasXt.h" "$(@D)/cuda/include/cublasXt.h" && cp "/usr/local/cuda-9.0/include/cublas_api.h" "$(@D)/cuda/include/cublas_api.h" && cp "/usr/local/cuda-9.0/include/cublas_v2.h" "$(@D)/cuda/include/cublas_v2.h" && cp "/usr/local/cuda-9.0/include/cuda.h" "$(@D)/cuda/include/cuda.h" && cp "/usr/local/cuda-9.0/include/cudaEGL.h" "$(@D)/cuda/include/cudaEGL.h" && cp "/usr/local/cuda-9.0/include/cudaGL.h" "$(@D)/cuda/include/cudaGL.h" && cp "/usr/local/cuda-9.0/include/cudaProfiler.h" "$(@D)/cuda/include/cudaProfiler.h" && cp "/usr/local/cuda-9.0/include/cudaVDPAU.h" "$(@D)/cuda/include/cudaVDPAU.h" && cp "/usr/local/cuda-9.0/include/cuda_device_runtime_api.h" "$(@D)/cuda/include/cuda_device_runtime_api.h" && cp "/usr/local/cuda-9.0/include/cuda_fp16.h" "$(@D)/cuda/include/cuda_fp16.h" && cp "/usr/local/cuda-9.0/include/cuda_fp16.hpp" "$(@D)/cuda/include/cuda_fp16.hpp" && cp "/usr/local/cuda-9.0/include/cuda_gl_interop.h" "$(@D)/cuda/include/cuda_gl_interop.h" && cp "/usr/local/cuda-9.0/include/cuda_occupancy.h" "$(@D)/cuda/include/cuda_occupancy.h" && cp "/usr/local/cuda-9.0/include/cuda_profiler_api.h" "$(@D)/cuda/include/cuda_profiler_api.h" && cp "/usr/local/cuda-9.0/include/cuda_runtime.h" "$(@D)/cuda/include/cuda_runtime.h" && cp "/usr/local/cuda-9.0/include/cuda_runtime_api.h" "$(@D)/cuda/include/cuda_runtime_api.h" && cp "/usr/local/cuda-9.0/include/cuda_surface_types.h" "$(@D)/cuda/include/cuda_surface_types.h" && cp "/usr/local/cuda-9.0/include/cuda_texture_types.h" "$(@D)/cuda/include/cuda_texture_types.h" && cp "/usr/local/cuda-9.0/include/cuda_vdpau_interop.h" "$(@D)/cuda/include/cuda_vdpau_interop.h" && cp "/usr/local/cuda-9.0/include/cudalibxt.h" "$(@D)/cuda/include/cudalibxt.h" && cp "/usr/local/cuda-9.0/include/cudnn.h" "$(@D)/cuda/include/cudnn.h" && cp "/usr/local/cuda-9.0/include/cufft.h" "$(@D)/cuda/include/cufft.h" && cp "/usr/local/cuda-9.0/include/cufftXt.h" "$(@D)/cuda/include/cufftXt.h" && cp "/usr/local/cuda-9.0/include/cufftw.h" "$(@D)/cuda/include/cufftw.h" && cp "/usr/local/cuda-9.0/include/curand.h" "$(@D)/cuda/include/curand.h" && cp "/usr/local/cuda-9.0/include/curand_discrete.h" "$(@D)/cuda/include/curand_discrete.h" && cp "/usr/local/cuda-9.0/include/curand_discrete2.h" "$(@D)/cuda/include/curand_discrete2.h" && cp "/usr/local/cuda-9.0/include/curand_globals.h" "$(@D)/cuda/include/curand_globals.h" && cp "/usr/local/cuda-9.0/include/curand_kernel.h" "$(@D)/cuda/include/curand_kernel.h" && cp "/usr/local/cuda-9.0/include/curand_lognormal.h" "$(@D)/cuda/include/curand_lognormal.h" && cp "/usr/local/cuda-9.0/include/curand_mrg32k3a.h" "$(@D)/cuda/include/curand_mrg32k3a.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32.h" "$(@D)/cuda/include/curand_mtgp32.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32_host.h" "$(@D)/cuda/include/curand_mtgp32_host.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32_kernel.h" "$(@D)/cuda/include/curand_mtgp32_kernel.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32dc_p_11213.h" "$(@D)/cuda/include/curand_mtgp32dc_p_11213.h" && cp "/usr/local/cuda-9.0/include/curand_normal.h" "$(@D)/cuda/include/curand_normal.h" && cp "/usr/local/cuda-9.0/include/curand_normal_static.h" "$(@D)/cuda/include/curand_normal_static.h" && cp "/usr/local/cuda-9.0/include/curand_philox4x32_x.h" "$(@D)/cuda/include/curand_philox4x32_x.h" && cp "/usr/local/cuda-9.0/include/curand_poisson.h" "$(@D)/cuda/include/curand_poisson.h" && cp "/usr/local/cuda-9.0/include/curand_precalc.h" "$(@D)/cuda/include/curand_precalc.h" && cp "/usr/local/cuda-9.0/include/curand_uniform.h" "$(@D)/cuda/include/curand_uniform.h" && cp "/usr/local/cuda-9.0/include/cusolverDn.h" "$(@D)/cuda/include/cusolverDn.h" && cp "/usr/local/cuda-9.0/include/cusolverRf.h" "$(@D)/cuda/include/cusolverRf.h" && cp "/usr/local/cuda-9.0/include/cusolverSp.h" "$(@D)/cuda/include/cusolverSp.h" && cp "/usr/local/cuda-9.0/include/cusolverSp_LOWLEVEL_PREVIEW.h" "$(@D)/cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h" && cp "/usr/local/cuda-9.0/include/cusolver_common.h" "$(@D)/cuda/include/cusolver_common.h" && cp "/usr/local/cuda-9.0/include/cusparse.h" "$(@D)/cuda/include/cusparse.h" && cp "/usr/local/cuda-9.0/include/cusparse_v2.h" "$(@D)/cuda/include/cusparse_v2.h" && cp "/usr/local/cuda-9.0/include/device_atomic_functions.h" "$(@D)/cuda/include/device_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/device_atomic_functions.hpp" "$(@D)/cuda/include/device_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/device_double_functions.h" "$(@D)/cuda/include/device_double_functions.h" && cp "/usr/local/cuda-9.0/include/device_double_functions.hpp" "$(@D)/cuda/include/device_double_functions.hpp" && cp "/usr/local/cuda-9.0/include/device_functions.h" "$(@D)/cuda/include/device_functions.h" && cp "/usr/local/cuda-9.0/include/device_functions.hpp" "$(@D)/cuda/include/device_functions.hpp" && cp "/usr/local/cuda-9.0/include/device_functions_decls.h" "$(@D)/cuda/include/device_functions_decls.h" && cp "/usr/local/cuda-9.0/include/device_launch_parameters.h" "$(@D)/cuda/include/device_launch_parameters.h" && cp "/usr/local/cuda-9.0/include/device_types.h" "$(@D)/cuda/include/device_types.h" && cp "/usr/local/cuda-9.0/include/driver_functions.h" "$(@D)/cuda/include/driver_functions.h" && cp "/usr/local/cuda-9.0/include/driver_types.h" "$(@D)/cuda/include/driver_types.h" && cp "/usr/local/cuda-9.0/include/dynlink_cuda.h" "$(@D)/cuda/include/dynlink_cuda.h" && cp "/usr/local/cuda-9.0/include/dynlink_cuda_cuda.h" "$(@D)/cuda/include/dynlink_cuda_cuda.h" && cp "/usr/local/cuda-9.0/include/dynlink_cuviddec.h" "$(@D)/cuda/include/dynlink_cuviddec.h" && cp "/usr/local/cuda-9.0/include/dynlink_nvcuvid.h" "$(@D)/cuda/include/dynlink_nvcuvid.h" && cp "/usr/local/cuda-9.0/include/fatBinaryCtl.h" "$(@D)/cuda/include/fatBinaryCtl.h" && cp "/usr/local/cuda-9.0/include/fatbinary.h" "$(@D)/cuda/include/fatbinary.h" && cp "/usr/local/cuda-9.0/include/host_config.h" "$(@D)/cuda/include/host_config.h" && cp "/usr/local/cuda-9.0/include/host_defines.h" "$(@D)/cuda/include/host_defines.h" && cp "/usr/local/cuda-9.0/include/library_types.h" "$(@D)/cuda/include/library_types.h" && cp "/usr/local/cuda-9.0/include/math_constants.h" "$(@D)/cuda/include/math_constants.h" && cp "/usr/local/cuda-9.0/include/math_functions.h" "$(@D)/cuda/include/math_functions.h" && cp "/usr/local/cuda-9.0/include/math_functions.hpp" "$(@D)/cuda/include/math_functions.hpp" && cp "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.h" "$(@D)/cuda/include/math_functions_dbl_ptx3.h" && cp "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.hpp" "$(@D)/cuda/include/math_functions_dbl_ptx3.hpp" && cp "/usr/local/cuda-9.0/include/mma.h" "$(@D)/cuda/include/mma.h" && cp "/usr/local/cuda-9.0/include/npp.h" "$(@D)/cuda/include/npp.h" && cp "/usr/local/cuda-9.0/include/nppcore.h" "$(@D)/cuda/include/nppcore.h" && cp "/usr/local/cuda-9.0/include/nppdefs.h" "$(@D)/cuda/include/nppdefs.h" && cp "/usr/local/cuda-9.0/include/nppi.h" "$(@D)/cuda/include/nppi.h" && cp "/usr/local/cuda-9.0/include/nppi_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/nppi_arithmetic_and_logical_operations.h" && cp "/usr/local/cuda-9.0/include/nppi_color_conversion.h" "$(@D)/cuda/include/nppi_color_conversion.h" && cp "/usr/local/cuda-9.0/include/nppi_compression_functions.h" "$(@D)/cuda/include/nppi_compression_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_computer_vision.h" "$(@D)/cuda/include/nppi_computer_vision.h" && cp "/usr/local/cuda-9.0/include/nppi_data_exchange_and_initialization.h" "$(@D)/cuda/include/nppi_data_exchange_and_initialization.h" && cp "/usr/local/cuda-9.0/include/nppi_filtering_functions.h" "$(@D)/cuda/include/nppi_filtering_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_geometry_transforms.h" "$(@D)/cuda/include/nppi_geometry_transforms.h" && cp "/usr/local/cuda-9.0/include/nppi_linear_transforms.h" "$(@D)/cuda/include/nppi_linear_transforms.h" && cp "/usr/local/cuda-9.0/include/nppi_morphological_operations.h" "$(@D)/cuda/include/nppi_morphological_operations.h" && cp "/usr/local/cuda-9.0/include/nppi_statistics_functions.h" "$(@D)/cuda/include/nppi_statistics_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_support_functions.h" "$(@D)/cuda/include/nppi_support_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_threshold_and_compare_operations.h" "$(@D)/cuda/include/nppi_threshold_and_compare_operations.h" && cp "/usr/local/cuda-9.0/include/npps.h" "$(@D)/cuda/include/npps.h" && cp "/usr/local/cuda-9.0/include/npps_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/npps_arithmetic_and_logical_operations.h" && cp "/usr/local/cuda-9.0/include/npps_conversion_functions.h" "$(@D)/cuda/include/npps_conversion_functions.h" && cp "/usr/local/cuda-9.0/include/npps_filtering_functions.h" "$(@D)/cuda/include/npps_filtering_functions.h" && cp "/usr/local/cuda-9.0/include/npps_initialization.h" "$(@D)/cuda/include/npps_initialization.h" && cp "/usr/local/cuda-9.0/include/npps_statistics_functions.h" "$(@D)/cuda/include/npps_statistics_functions.h" && cp "/usr/local/cuda-9.0/include/npps_support_functions.h" "$(@D)/cuda/include/npps_support_functions.h" && cp "/usr/local/cuda-9.0/include/nppversion.h" "$(@D)/cuda/include/nppversion.h" && cp "/usr/local/cuda-9.0/include/nvToolsExt.h" "$(@D)/cuda/include/nvToolsExt.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtCuda.h" "$(@D)/cuda/include/nvToolsExtCuda.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvToolsExtCudaRt.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtMeta.h" "$(@D)/cuda/include/nvToolsExtMeta.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtSync.h" "$(@D)/cuda/include/nvToolsExtSync.h" && cp "/usr/local/cuda-9.0/include/nvblas.h" "$(@D)/cuda/include/nvblas.h" && cp "/usr/local/cuda-9.0/include/nvfunctional" "$(@D)/cuda/include/nvfunctional" && cp "/usr/local/cuda-9.0/include/nvgraph.h" "$(@D)/cuda/include/nvgraph.h" && cp "/usr/local/cuda-9.0/include/nvml.h" "$(@D)/cuda/include/nvml.h" && cp "/usr/local/cuda-9.0/include/nvrtc.h" "$(@D)/cuda/include/nvrtc.h" && cp "/usr/local/cuda-9.0/include/sm_20_atomic_functions.h" "$(@D)/cuda/include/sm_20_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_20_atomic_functions.hpp" "$(@D)/cuda/include/sm_20_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/sm_20_intrinsics.h" "$(@D)/cuda/include/sm_20_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_20_intrinsics.hpp" "$(@D)/cuda/include/sm_20_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sm_30_intrinsics.h" "$(@D)/cuda/include/sm_30_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_30_intrinsics.hpp" "$(@D)/cuda/include/sm_30_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sm_32_atomic_functions.h" "$(@D)/cuda/include/sm_32_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_32_atomic_functions.hpp" "$(@D)/cuda/include/sm_32_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/sm_32_intrinsics.h" "$(@D)/cuda/include/sm_32_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_32_intrinsics.hpp" "$(@D)/cuda/include/sm_32_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sm_35_atomic_functions.h" "$(@D)/cuda/include/sm_35_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_35_intrinsics.h" "$(@D)/cuda/include/sm_35_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_60_atomic_functions.h" "$(@D)/cuda/include/sm_60_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_60_atomic_functions.hpp" "$(@D)/cuda/include/sm_60_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/sm_61_intrinsics.h" "$(@D)/cuda/include/sm_61_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_61_intrinsics.hpp" "$(@D)/cuda/include/sm_61_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sobol_direction_vectors.h" "$(@D)/cuda/include/sobol_direction_vectors.h" && cp "/usr/local/cuda-9.0/include/surface_functions.h" "$(@D)/cuda/include/surface_functions.h" && cp "/usr/local/cuda-9.0/include/surface_functions.hpp" "$(@D)/cuda/include/surface_functions.hpp" && cp "/usr/local/cuda-9.0/include/surface_indirect_functions.h" "$(@D)/cuda/include/surface_indirect_functions.h" && cp "/usr/local/cuda-9.0/include/surface_indirect_functions.hpp" "$(@D)/cuda/include/surface_indirect_functions.hpp" && cp "/usr/local/cuda-9.0/include/surface_types.h" "$(@D)/cuda/include/surface_types.h" && cp "/usr/local/cuda-9.0/include/texture_fetch_functions.h" "$(@D)/cuda/include/texture_fetch_functions.h" && cp "/usr/local/cuda-9.0/include/texture_fetch_functions.hpp" "$(@D)/cuda/include/texture_fetch_functions.hpp" && cp "/usr/local/cuda-9.0/include/texture_indirect_functions.h" "$(@D)/cuda/include/texture_indirect_functions.h" && cp "/usr/local/cuda-9.0/include/texture_indirect_functions.hpp" "$(@D)/cuda/include/texture_indirect_functions.hpp" && cp "/usr/local/cuda-9.0/include/texture_types.h" "$(@D)/cuda/include/texture_types.h" && cp "/usr/local/cuda-9.0/include/thrust/adjacent_difference.h" "$(@D)/cuda/include/thrust/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/advance.h" "$(@D)/cuda/include/thrust/advance.h" && cp "/usr/local/cuda-9.0/include/thrust/binary_search.h" "$(@D)/cuda/include/thrust/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/complex.h" "$(@D)/cuda/include/thrust/complex.h" && cp "/usr/local/cuda-9.0/include/thrust/copy.h" "$(@D)/cuda/include/thrust/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/count.h" "$(@D)/cuda/include/thrust/count.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/adjacent_difference.inl" "$(@D)/cuda/include/thrust/detail/adjacent_difference.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/advance.inl" "$(@D)/cuda/include/thrust/detail/advance.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.h" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.inl" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.h" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/no_throw_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/no_throw_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/binary_search.inl" "$(@D)/cuda/include/thrust/detail/binary_search.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/arithmetic.h" "$(@D)/cuda/include/thrust/detail/complex/arithmetic.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/c99math.h" "$(@D)/cuda/include/thrust/detail/complex/c99math.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/catrig.h" "$(@D)/cuda/include/thrust/detail/complex/catrig.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/catrigf.h" "$(@D)/cuda/include/thrust/detail/complex/catrigf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ccosh.h" "$(@D)/cuda/include/thrust/detail/complex/ccosh.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ccoshf.h" "$(@D)/cuda/include/thrust/detail/complex/ccoshf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cexp.h" "$(@D)/cuda/include/thrust/detail/complex/cexp.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cexpf.h" "$(@D)/cuda/include/thrust/detail/complex/cexpf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/clog.h" "$(@D)/cuda/include/thrust/detail/complex/clog.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/clogf.h" "$(@D)/cuda/include/thrust/detail/complex/clogf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/complex.inl" "$(@D)/cuda/include/thrust/detail/complex/complex.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cpow.h" "$(@D)/cuda/include/thrust/detail/complex/cpow.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cpowf.h" "$(@D)/cuda/include/thrust/detail/complex/cpowf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cproj.h" "$(@D)/cuda/include/thrust/detail/complex/cproj.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csinh.h" "$(@D)/cuda/include/thrust/detail/complex/csinh.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csinhf.h" "$(@D)/cuda/include/thrust/detail/complex/csinhf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrt.h" "$(@D)/cuda/include/thrust/detail/complex/csqrt.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrtf.h" "$(@D)/cuda/include/thrust/detail/complex/csqrtf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanh.h" "$(@D)/cuda/include/thrust/detail/complex/ctanh.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanhf.h" "$(@D)/cuda/include/thrust/detail/complex/ctanhf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/math_private.h" "$(@D)/cuda/include/thrust/detail/complex/math_private.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/stream.h" "$(@D)/cuda/include/thrust/detail/complex/stream.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config.h" "$(@D)/cuda/include/thrust/detail/config.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/compiler.h" "$(@D)/cuda/include/thrust/detail/config/compiler.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/compiler_fence.h" "$(@D)/cuda/include/thrust/detail/config/compiler_fence.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/config.h" "$(@D)/cuda/include/thrust/detail/config/config.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/debug.h" "$(@D)/cuda/include/thrust/detail/config/debug.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/device_system.h" "$(@D)/cuda/include/thrust/detail/config/device_system.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/exec_check_disable.h" "$(@D)/cuda/include/thrust/detail/config/exec_check_disable.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/forceinline.h" "$(@D)/cuda/include/thrust/detail/config/forceinline.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/global_workarounds.h" "$(@D)/cuda/include/thrust/detail/config/global_workarounds.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/host_device.h" "$(@D)/cuda/include/thrust/detail/config/host_device.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/host_system.h" "$(@D)/cuda/include/thrust/detail/config/host_system.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/simple_defines.h" "$(@D)/cuda/include/thrust/detail/config/simple_defines.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.h" "$(@D)/cuda/include/thrust/detail/contiguous_storage.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.inl" "$(@D)/cuda/include/thrust/detail/contiguous_storage.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy.h" "$(@D)/cuda/include/thrust/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy.inl" "$(@D)/cuda/include/thrust/detail/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy_if.h" "$(@D)/cuda/include/thrust/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy_if.inl" "$(@D)/cuda/include/thrust/detail/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/count.inl" "$(@D)/cuda/include/thrust/detail/count.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/cstdint.h" "$(@D)/cuda/include/thrust/detail/cstdint.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_delete.inl" "$(@D)/cuda/include/thrust/detail/device_delete.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_free.inl" "$(@D)/cuda/include/thrust/detail/device_free.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_malloc.inl" "$(@D)/cuda/include/thrust/detail/device_malloc.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_new.inl" "$(@D)/cuda/include/thrust/detail/device_new.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_ptr.inl" "$(@D)/cuda/include/thrust/detail/device_ptr.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_reference.inl" "$(@D)/cuda/include/thrust/detail/device_reference.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_vector.inl" "$(@D)/cuda/include/thrust/detail/device_vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/dispatch/is_trivial_copy.h" "$(@D)/cuda/include/thrust/detail/dispatch/is_trivial_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/distance.inl" "$(@D)/cuda/include/thrust/detail/distance.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/equal.inl" "$(@D)/cuda/include/thrust/detail/equal.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/execute_with_allocator.h" "$(@D)/cuda/include/thrust/detail/execute_with_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/execution_policy.h" "$(@D)/cuda/include/thrust/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/extrema.inl" "$(@D)/cuda/include/thrust/detail/extrema.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/fill.inl" "$(@D)/cuda/include/thrust/detail/fill.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/find.inl" "$(@D)/cuda/include/thrust/detail/find.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/for_each.inl" "$(@D)/cuda/include/thrust/detail/for_each.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/function.h" "$(@D)/cuda/include/thrust/detail/function.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional.inl" "$(@D)/cuda/include/thrust/detail/functional.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.h" "$(@D)/cuda/include/thrust/detail/functional/actor.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.inl" "$(@D)/cuda/include/thrust/detail/functional/actor.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/argument.h" "$(@D)/cuda/include/thrust/detail/functional/argument.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/composite.h" "$(@D)/cuda/include/thrust/detail/functional/composite.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/arithmetic_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/arithmetic_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/assignment_operator.h" "$(@D)/cuda/include/thrust/detail/functional/operators/assignment_operator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/bitwise_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/bitwise_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/compound_assignment_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/logical_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/logical_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/operator_adaptors.h" "$(@D)/cuda/include/thrust/detail/functional/operators/operator_adaptors.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/relational_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/relational_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/placeholder.h" "$(@D)/cuda/include/thrust/detail/functional/placeholder.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/value.h" "$(@D)/cuda/include/thrust/detail/functional/value.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/gather.inl" "$(@D)/cuda/include/thrust/detail/gather.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/generate.inl" "$(@D)/cuda/include/thrust/detail/generate.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/get_iterator_value.h" "$(@D)/cuda/include/thrust/detail/get_iterator_value.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/host_vector.inl" "$(@D)/cuda/include/thrust/detail/host_vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/inner_product.inl" "$(@D)/cuda/include/thrust/detail/inner_product.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/integer_math.h" "$(@D)/cuda/include/thrust/detail/integer_math.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/integer_traits.h" "$(@D)/cuda/include/thrust/detail/integer_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/internal_functional.h" "$(@D)/cuda/include/thrust/detail/internal_functional.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/logical.inl" "$(@D)/cuda/include/thrust/detail/logical.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/merge.inl" "$(@D)/cuda/include/thrust/detail/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/minmax.h" "$(@D)/cuda/include/thrust/detail/minmax.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/mismatch.inl" "$(@D)/cuda/include/thrust/detail/mismatch.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/mpl/math.h" "$(@D)/cuda/include/thrust/detail/mpl/math.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/numeric_traits.h" "$(@D)/cuda/include/thrust/detail/numeric_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/overlapped_copy.h" "$(@D)/cuda/include/thrust/detail/overlapped_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/pair.inl" "$(@D)/cuda/include/thrust/detail/pair.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/partition.inl" "$(@D)/cuda/include/thrust/detail/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/pointer.h" "$(@D)/cuda/include/thrust/detail/pointer.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/pointer.inl" "$(@D)/cuda/include/thrust/detail/pointer.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/range/head_flags.h" "$(@D)/cuda/include/thrust/detail/range/head_flags.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/range/tail_flags.h" "$(@D)/cuda/include/thrust/detail/range/tail_flags.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/raw_pointer_cast.h" "$(@D)/cuda/include/thrust/detail/raw_pointer_cast.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/raw_reference_cast.h" "$(@D)/cuda/include/thrust/detail/raw_reference_cast.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/reduce.inl" "$(@D)/cuda/include/thrust/detail/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/reference.h" "$(@D)/cuda/include/thrust/detail/reference.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/reference.inl" "$(@D)/cuda/include/thrust/detail/reference.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/reference_forward_declaration.h" "$(@D)/cuda/include/thrust/detail/reference_forward_declaration.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/remove.inl" "$(@D)/cuda/include/thrust/detail/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/replace.inl" "$(@D)/cuda/include/thrust/detail/replace.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/reverse.inl" "$(@D)/cuda/include/thrust/detail/reverse.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/scan.inl" "$(@D)/cuda/include/thrust/detail/scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/scatter.inl" "$(@D)/cuda/include/thrust/detail/scatter.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/seq.h" "$(@D)/cuda/include/thrust/detail/seq.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/sequence.inl" "$(@D)/cuda/include/thrust/detail/sequence.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/set_operations.inl" "$(@D)/cuda/include/thrust/detail/set_operations.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/sort.inl" "$(@D)/cuda/include/thrust/detail/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/static_assert.h" "$(@D)/cuda/include/thrust/detail/static_assert.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/static_map.h" "$(@D)/cuda/include/thrust/detail/static_map.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/swap.h" "$(@D)/cuda/include/thrust/detail/swap.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/swap.inl" "$(@D)/cuda/include/thrust/detail/swap.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/swap_ranges.inl" "$(@D)/cuda/include/thrust/detail/swap_ranges.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/tabulate.inl" "$(@D)/cuda/include/thrust/detail/tabulate.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.h" "$(@D)/cuda/include/thrust/detail/temporary_array.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.inl" "$(@D)/cuda/include/thrust/detail/temporary_array.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/transform.inl" "$(@D)/cuda/include/thrust/detail/transform.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/transform_reduce.inl" "$(@D)/cuda/include/thrust/detail/transform_reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/transform_scan.inl" "$(@D)/cuda/include/thrust/detail/transform_scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/trivial_sequence.h" "$(@D)/cuda/include/thrust/detail/trivial_sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/tuple.inl" "$(@D)/cuda/include/thrust/detail/tuple.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/tuple_meta_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_meta_transform.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/tuple_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_transform.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" "$(@D)/cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/function_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/function_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_member_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_member_function.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_nested_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_nested_type.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_trivial_assign.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_trivial_assign.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_call_possible.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_call_possible.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_metafunction_defined.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_metafunction_defined.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_output_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/minimum_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/minimum_type.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/pointer_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/pointer_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/result_of_adaptable_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_fill.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/unique.inl" "$(@D)/cuda/include/thrust/detail/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/use_default.h" "$(@D)/cuda/include/thrust/detail/use_default.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/util/align.h" "$(@D)/cuda/include/thrust/detail/util/align.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/util/blocking.h" "$(@D)/cuda/include/thrust/detail/util/blocking.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/vector_base.h" "$(@D)/cuda/include/thrust/detail/vector_base.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/vector_base.inl" "$(@D)/cuda/include/thrust/detail/vector_base.inl" && cp "/usr/local/cuda-9.0/include/thrust/device_allocator.h" "$(@D)/cuda/include/thrust/device_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/device_delete.h" "$(@D)/cuda/include/thrust/device_delete.h" && cp "/usr/local/cuda-9.0/include/thrust/device_free.h" "$(@D)/cuda/include/thrust/device_free.h" && cp "/usr/local/cuda-9.0/include/thrust/device_malloc.h" "$(@D)/cuda/include/thrust/device_malloc.h" && cp "/usr/local/cuda-9.0/include/thrust/device_malloc_allocator.h" "$(@D)/cuda/include/thrust/device_malloc_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/device_new.h" "$(@D)/cuda/include/thrust/device_new.h" && cp "/usr/local/cuda-9.0/include/thrust/device_new_allocator.h" "$(@D)/cuda/include/thrust/device_new_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/device_ptr.h" "$(@D)/cuda/include/thrust/device_ptr.h" && cp "/usr/local/cuda-9.0/include/thrust/device_reference.h" "$(@D)/cuda/include/thrust/device_reference.h" && cp "/usr/local/cuda-9.0/include/thrust/device_vector.h" "$(@D)/cuda/include/thrust/device_vector.h" && cp "/usr/local/cuda-9.0/include/thrust/distance.h" "$(@D)/cuda/include/thrust/distance.h" && cp "/usr/local/cuda-9.0/include/thrust/equal.h" "$(@D)/cuda/include/thrust/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/execution_policy.h" "$(@D)/cuda/include/thrust/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/extrema.h" "$(@D)/cuda/include/thrust/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/fill.h" "$(@D)/cuda/include/thrust/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/find.h" "$(@D)/cuda/include/thrust/find.h" && cp "/usr/local/cuda-9.0/include/thrust/for_each.h" "$(@D)/cuda/include/thrust/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/functional.h" "$(@D)/cuda/include/thrust/functional.h" && cp "/usr/local/cuda-9.0/include/thrust/gather.h" "$(@D)/cuda/include/thrust/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/generate.h" "$(@D)/cuda/include/thrust/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/host_vector.h" "$(@D)/cuda/include/thrust/host_vector.h" && cp "/usr/local/cuda-9.0/include/thrust/inner_product.h" "$(@D)/cuda/include/thrust/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/constant_iterator.h" "$(@D)/cuda/include/thrust/iterator/constant_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/counting_iterator.h" "$(@D)/cuda/include/thrust/iterator/counting_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_assign.h" "$(@D)/cuda/include/thrust/iterator/detail/any_assign.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/any_system_tag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/constant_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/constant_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/counting_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/counting_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/device_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/device_system_tag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/discard_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/discard_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/distance_from_result.h" "$(@D)/cuda/include/thrust/iterator/detail/distance_from_result.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/host_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/host_system_tag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_iterator_category.h" "$(@D)/cuda/include/thrust/iterator/detail/is_iterator_category.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_trivial_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/is_trivial_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_adaptor_base.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_adaptor_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_system.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_system.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_facade_category.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_facade_category.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traits.inl" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traits.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traversal_tags.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traversal_tags.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/join_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/join_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_category.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_category.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_system.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_system.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/normal_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/normal_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/permutation_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/permutation_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/retag.h" "$(@D)/cuda/include/thrust/iterator/detail/retag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/tagged_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/tagged_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_output_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_output_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/tuple_of_iterator_references.h" "$(@D)/cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/universal_categories.h" "$(@D)/cuda/include/thrust/iterator/detail/universal_categories.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/discard_iterator.h" "$(@D)/cuda/include/thrust/iterator/discard_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_adaptor.h" "$(@D)/cuda/include/thrust/iterator/iterator_adaptor.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_categories.h" "$(@D)/cuda/include/thrust/iterator/iterator_categories.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_facade.h" "$(@D)/cuda/include/thrust/iterator/iterator_facade.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_traits.h" "$(@D)/cuda/include/thrust/iterator/iterator_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/permutation_iterator.h" "$(@D)/cuda/include/thrust/iterator/permutation_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/retag.h" "$(@D)/cuda/include/thrust/iterator/retag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/reverse_iterator.h" "$(@D)/cuda/include/thrust/iterator/reverse_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/transform_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/transform_output_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_output_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/zip_iterator.h" "$(@D)/cuda/include/thrust/iterator/zip_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/logical.h" "$(@D)/cuda/include/thrust/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/memory.h" "$(@D)/cuda/include/thrust/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/merge.h" "$(@D)/cuda/include/thrust/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/mismatch.h" "$(@D)/cuda/include/thrust/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/pair.h" "$(@D)/cuda/include/thrust/pair.h" && cp "/usr/local/cuda-9.0/include/thrust/partition.h" "$(@D)/cuda/include/thrust/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/random.h" "$(@D)/cuda/include/thrust/random.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/discard_block_engine.inl" "$(@D)/cuda/include/thrust/random/detail/discard_block_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine_discard.h" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine_discard.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/mod.h" "$(@D)/cuda/include/thrust/random/detail/mod.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/normal_distribution.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution_base.h" "$(@D)/cuda/include/thrust/random/detail/normal_distribution_base.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/random_core_access.h" "$(@D)/cuda/include/thrust/random/detail/random_core_access.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/subtract_with_carry_engine.inl" "$(@D)/cuda/include/thrust/random/detail/subtract_with_carry_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_int_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_int_distribution.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_real_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_real_distribution.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine.inl" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine_max.h" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine_max.h" && cp "/usr/local/cuda-9.0/include/thrust/random/discard_block_engine.h" "$(@D)/cuda/include/thrust/random/discard_block_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/linear_congruential_engine.h" "$(@D)/cuda/include/thrust/random/linear_congruential_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/linear_feedback_shift_engine.h" "$(@D)/cuda/include/thrust/random/linear_feedback_shift_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/normal_distribution.h" "$(@D)/cuda/include/thrust/random/normal_distribution.h" && cp "/usr/local/cuda-9.0/include/thrust/random/subtract_with_carry_engine.h" "$(@D)/cuda/include/thrust/random/subtract_with_carry_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/uniform_int_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_int_distribution.h" && cp "/usr/local/cuda-9.0/include/thrust/random/uniform_real_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_real_distribution.h" && cp "/usr/local/cuda-9.0/include/thrust/random/xor_combine_engine.h" "$(@D)/cuda/include/thrust/random/xor_combine_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/reduce.h" "$(@D)/cuda/include/thrust/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/remove.h" "$(@D)/cuda/include/thrust/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/replace.h" "$(@D)/cuda/include/thrust/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/reverse.h" "$(@D)/cuda/include/thrust/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/scan.h" "$(@D)/cuda/include/thrust/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/scatter.h" "$(@D)/cuda/include/thrust/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/sequence.h" "$(@D)/cuda/include/thrust/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/set_operations.h" "$(@D)/cuda/include/thrust/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/sort.h" "$(@D)/cuda/include/thrust/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/swap.h" "$(@D)/cuda/include/thrust/swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cpp/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cpp/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/count.h" "$(@D)/cuda/include/thrust/system/cpp/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/equal.h" "$(@D)/cuda/include/thrust/system/cpp/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cpp/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/find.h" "$(@D)/cuda/include/thrust/system/cpp/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cpp/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/gather.h" "$(@D)/cuda/include/thrust/system/cpp/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/generate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cpp/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cpp/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/logical.h" "$(@D)/cuda/include/thrust/system/cpp/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cpp/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/merge.h" "$(@D)/cuda/include/thrust/system/cpp/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cpp/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/par.h" "$(@D)/cuda/include/thrust/system/cpp/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/partition.h" "$(@D)/cuda/include/thrust/system/cpp/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/remove.h" "$(@D)/cuda/include/thrust/system/cpp/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/replace.h" "$(@D)/cuda/include/thrust/system/cpp/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cpp/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sort.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cpp/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cpp/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/memory.h" "$(@D)/cuda/include/thrust/system/cpp/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/vector.h" "$(@D)/cuda/include/thrust/system/cpp/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/config.h" "$(@D)/cuda/include/thrust/system/cuda/config.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cuda/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cuda/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/agent_launcher.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/agent_launcher.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/alignment.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/alignment.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/util.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/count.h" "$(@D)/cuda/include/thrust/system/cuda/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/cross_system.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/cub.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/cub.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_select.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/host/mutex.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_allocator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_arch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_debug.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_device.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_device.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_macro.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_namespace.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_ptx.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_type.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_type.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/equal.h" "$(@D)/cuda/include/thrust/system/cuda/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/error.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/error.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cuda/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/find.h" "$(@D)/cuda/include/thrust/system/cuda/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cuda/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/gather.h" "$(@D)/cuda/include/thrust/system/cuda/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/generate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_driver_types.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_driver_types.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cuda/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cuda/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/logical.h" "$(@D)/cuda/include/thrust/system/cuda/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cuda/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/memory_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par_to_seq.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par_to_seq.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/parallel_for.h" "$(@D)/cuda/include/thrust/system/cuda/detail/parallel_for.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/partition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/remove.h" "$(@D)/cuda/include/thrust/system/cuda/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/replace.h" "$(@D)/cuda/include/thrust/system/cuda/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cuda/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cuda/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/terminate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/terminate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/util.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/error.h" "$(@D)/cuda/include/thrust/system/cuda/error.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/experimental/pinned_allocator.h" "$(@D)/cuda/include/thrust/system/cuda/experimental/pinned_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/memory.h" "$(@D)/cuda/include/thrust/system/cuda/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/vector.h" "$(@D)/cuda/include/thrust/system/cuda/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/adl/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/adl/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/count.h" "$(@D)/cuda/include/thrust/system/detail/adl/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/equal.h" "$(@D)/cuda/include/thrust/system/detail/adl/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/extrema.h" "$(@D)/cuda/include/thrust/system/detail/adl/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/find.h" "$(@D)/cuda/include/thrust/system/detail/adl/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/for_each.h" "$(@D)/cuda/include/thrust/system/detail/adl/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/gather.h" "$(@D)/cuda/include/thrust/system/detail/adl/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/generate.h" "$(@D)/cuda/include/thrust/system/detail/adl/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/get_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/adl/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/adl/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/logical.h" "$(@D)/cuda/include/thrust/system/detail/adl/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/adl/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/merge.h" "$(@D)/cuda/include/thrust/system/detail/adl/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/adl/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/partition.h" "$(@D)/cuda/include/thrust/system/detail/adl/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/remove.h" "$(@D)/cuda/include/thrust/system/detail/adl/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/replace.h" "$(@D)/cuda/include/thrust/system/detail/adl/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reverse.h" "$(@D)/cuda/include/thrust/system/detail/adl/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scatter.h" "$(@D)/cuda/include/thrust/system/detail/adl/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sequence.h" "$(@D)/cuda/include/thrust/system/detail/adl/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/adl/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sort.h" "$(@D)/cuda/include/thrust/system/detail/adl/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/adl/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/adl/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/adl/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/bad_alloc.h" "$(@D)/cuda/include/thrust/system/detail/bad_alloc.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/errno.h" "$(@D)/cuda/include/thrust/system/detail/errno.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/error_category.inl" "$(@D)/cuda/include/thrust/system/detail/error_category.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/error_code.inl" "$(@D)/cuda/include/thrust/system/detail/error_code.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/error_condition.inl" "$(@D)/cuda/include/thrust/system/detail/error_condition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.inl" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.h" "$(@D)/cuda/include/thrust/system/detail/generic/advance.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/advance.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.h" "$(@D)/cuda/include/thrust/system/detail/generic/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.inl" "$(@D)/cuda/include/thrust/system/detail/generic/count.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.h" "$(@D)/cuda/include/thrust/system/detail/generic/distance.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/distance.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.h" "$(@D)/cuda/include/thrust/system/detail/generic/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.inl" "$(@D)/cuda/include/thrust/system/detail/generic/equal.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.h" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.inl" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.h" "$(@D)/cuda/include/thrust/system/detail/generic/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.inl" "$(@D)/cuda/include/thrust/system/detail/generic/find.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/for_each.h" "$(@D)/cuda/include/thrust/system/detail/generic/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.h" "$(@D)/cuda/include/thrust/system/detail/generic/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.inl" "$(@D)/cuda/include/thrust/system/detail/generic/gather.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.h" "$(@D)/cuda/include/thrust/system/detail/generic/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/generate.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.inl" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/logical.h" "$(@D)/cuda/include/thrust/system/detail/generic/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.h" "$(@D)/cuda/include/thrust/system/detail/generic/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.inl" "$(@D)/cuda/include/thrust/system/detail/generic/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.h" "$(@D)/cuda/include/thrust/system/detail/generic/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.inl" "$(@D)/cuda/include/thrust/system/detail/generic/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.inl" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.h" "$(@D)/cuda/include/thrust/system/detail/generic/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.inl" "$(@D)/cuda/include/thrust/system/detail/generic/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.h" "$(@D)/cuda/include/thrust/system/detail/generic/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.inl" "$(@D)/cuda/include/thrust/system/detail/generic/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.h" "$(@D)/cuda/include/thrust/system/detail/generic/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.inl" "$(@D)/cuda/include/thrust/system/detail/generic/replace.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.h" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.h" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/select_system.h" "$(@D)/cuda/include/thrust/system/detail/generic/select_system.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.h" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.inl" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.h" "$(@D)/cuda/include/thrust/system/detail/generic/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.inl" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tag.h" "$(@D)/cuda/include/thrust/system/detail/generic/tag.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.inl" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/type_traits.h" "$(@D)/cuda/include/thrust/system/detail/generic/type_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/internal/decompose.h" "$(@D)/cuda/include/thrust/system/detail/internal/decompose.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/sequential/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/sequential/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_backward.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_backward.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/count.h" "$(@D)/cuda/include/thrust/system/detail/sequential/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/equal.h" "$(@D)/cuda/include/thrust/system/detail/sequential/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/execution_policy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/extrema.h" "$(@D)/cuda/include/thrust/system/detail/sequential/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/find.h" "$(@D)/cuda/include/thrust/system/detail/sequential/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/for_each.h" "$(@D)/cuda/include/thrust/system/detail/sequential/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/gather.h" "$(@D)/cuda/include/thrust/system/detail/sequential/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/general_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/general_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/generate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/get_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/sequential/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/insertion_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/insertion_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/sequential/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/logical.h" "$(@D)/cuda/include/thrust/system/detail/sequential/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/sequential/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.h" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/sequential/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/partition.h" "$(@D)/cuda/include/thrust/system/detail/sequential/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/remove.h" "$(@D)/cuda/include/thrust/system/detail/sequential/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/replace.h" "$(@D)/cuda/include/thrust/system/detail/sequential/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reverse.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scatter.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sequence.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/sequential/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/sequential/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/sequential/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/trivial_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/trivial_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/system_error.inl" "$(@D)/cuda/include/thrust/system/detail/system_error.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/error_code.h" "$(@D)/cuda/include/thrust/system/error_code.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/omp/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/omp/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/count.h" "$(@D)/cuda/include/thrust/system/omp/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.h" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/equal.h" "$(@D)/cuda/include/thrust/system/omp/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/omp/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/find.h" "$(@D)/cuda/include/thrust/system/omp/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/gather.h" "$(@D)/cuda/include/thrust/system/omp/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/generate.h" "$(@D)/cuda/include/thrust/system/omp/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/omp/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/omp/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/logical.h" "$(@D)/cuda/include/thrust/system/omp/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/omp/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/omp/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/merge.h" "$(@D)/cuda/include/thrust/system/omp/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/omp/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/par.h" "$(@D)/cuda/include/thrust/system/omp/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.h" "$(@D)/cuda/include/thrust/system/omp/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.h" "$(@D)/cuda/include/thrust/system/omp/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.inl" "$(@D)/cuda/include/thrust/system/omp/detail/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/replace.h" "$(@D)/cuda/include/thrust/system/omp/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/omp/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/omp/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/omp/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/omp/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.h" "$(@D)/cuda/include/thrust/system/omp/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.inl" "$(@D)/cuda/include/thrust/system/omp/detail/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/omp/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/omp/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/omp/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/omp/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/memory.h" "$(@D)/cuda/include/thrust/system/omp/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/vector.h" "$(@D)/cuda/include/thrust/system/omp/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system/system_error.h" "$(@D)/cuda/include/thrust/system/system_error.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/tbb/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/tbb/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/count.h" "$(@D)/cuda/include/thrust/system/tbb/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/equal.h" "$(@D)/cuda/include/thrust/system/tbb/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/extrema.h" "$(@D)/cuda/include/thrust/system/tbb/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/find.h" "$(@D)/cuda/include/thrust/system/tbb/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.h" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/gather.h" "$(@D)/cuda/include/thrust/system/tbb/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/generate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/get_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/tbb/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/tbb/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/logical.h" "$(@D)/cuda/include/thrust/system/tbb/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/tbb/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/memory.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.h" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/tbb/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/par.h" "$(@D)/cuda/include/thrust/system/tbb/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.h" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_intervals.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.h" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/replace.h" "$(@D)/cuda/include/thrust/system/tbb/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reverse.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scatter.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sequence.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/tbb/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/tbb/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/tbb/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/vector.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/memory.h" "$(@D)/cuda/include/thrust/system/tbb/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/vector.h" "$(@D)/cuda/include/thrust/system/tbb/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system_error.h" "$(@D)/cuda/include/thrust/system_error.h" && cp "/usr/local/cuda-9.0/include/thrust/tabulate.h" "$(@D)/cuda/include/thrust/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/transform.h" "$(@D)/cuda/include/thrust/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/transform_reduce.h" "$(@D)/cuda/include/thrust/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/transform_scan.h" "$(@D)/cuda/include/thrust/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/tuple.h" "$(@D)/cuda/include/thrust/tuple.h" && cp "/usr/local/cuda-9.0/include/thrust/uninitialized_copy.h" "$(@D)/cuda/include/thrust/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/uninitialized_fill.h" "$(@D)/cuda/include/thrust/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/unique.h" "$(@D)/cuda/include/thrust/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/version.h" "$(@D)/cuda/include/thrust/version.h" && cp "/usr/local/cuda-9.0/include/vector_functions.h" "$(@D)/cuda/include/vector_functions.h" && cp "/usr/local/cuda-9.0/include/vector_functions.hpp" "$(@D)/cuda/include/vector_functions.hpp" && cp "/usr/local/cuda-9.0/include/vector_types.h" "$(@D)/cuda/include/vector_types.h"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-10.0/include/CL/cl.h" "$(@D)/cuda/include/CL/cl.h" && cp "/usr/local/cuda-10.0/include/CL/cl.hpp" "$(@D)/cuda/include/CL/cl.hpp" && cp "/usr/local/cuda-10.0/include/CL/cl_egl.h" "$(@D)/cuda/include/CL/cl_egl.h" && cp "/usr/local/cuda-10.0/include/CL/cl_ext.h" "$(@D)/cuda/include/CL/cl_ext.h" && cp "/usr/local/cuda-10.0/include/CL/cl_gl.h" "$(@D)/cuda/include/CL/cl_gl.h" && cp "/usr/local/cuda-10.0/include/CL/cl_gl_ext.h" "$(@D)/cuda/include/CL/cl_gl_ext.h" && cp "/usr/local/cuda-10.0/include/CL/cl_platform.h" "$(@D)/cuda/include/CL/cl_platform.h" && cp "/usr/local/cuda-10.0/include/CL/opencl.h" "$(@D)/cuda/include/CL/opencl.h" && cp "/usr/local/cuda-10.0/include/builtin_types.h" "$(@D)/cuda/include/builtin_types.h" && cp "/usr/local/cuda-10.0/include/channel_descriptor.h" "$(@D)/cuda/include/channel_descriptor.h" && cp "/usr/local/cuda-10.0/include/common_functions.h" "$(@D)/cuda/include/common_functions.h" && cp "/usr/local/cuda-10.0/include/cooperative_groups.h" "$(@D)/cuda/include/cooperative_groups.h" && cp "/usr/local/cuda-10.0/include/cooperative_groups_helpers.h" "$(@D)/cuda/include/cooperative_groups_helpers.h" && cp "/usr/local/cuda-10.0/include/crt/common_functions.h" "$(@D)/cuda/include/crt/common_functions.h" && cp "/usr/local/cuda-10.0/include/crt/device_double_functions.h" "$(@D)/cuda/include/crt/device_double_functions.h" && cp "/usr/local/cuda-10.0/include/crt/device_double_functions.hpp" "$(@D)/cuda/include/crt/device_double_functions.hpp" && cp "/usr/local/cuda-10.0/include/crt/device_functions.h" "$(@D)/cuda/include/crt/device_functions.h" && cp "/usr/local/cuda-10.0/include/crt/device_functions.hpp" "$(@D)/cuda/include/crt/device_functions.hpp" && cp "/usr/local/cuda-10.0/include/crt/func_macro.h" "$(@D)/cuda/include/crt/func_macro.h" && cp "/usr/local/cuda-10.0/include/crt/host_config.h" "$(@D)/cuda/include/crt/host_config.h" && cp "/usr/local/cuda-10.0/include/crt/host_defines.h" "$(@D)/cuda/include/crt/host_defines.h" && cp "/usr/local/cuda-10.0/include/crt/host_runtime.h" "$(@D)/cuda/include/crt/host_runtime.h" && cp "/usr/local/cuda-10.0/include/crt/math_functions.h" "$(@D)/cuda/include/crt/math_functions.h" && cp "/usr/local/cuda-10.0/include/crt/math_functions.hpp" "$(@D)/cuda/include/crt/math_functions.hpp" && cp "/usr/local/cuda-10.0/include/crt/mma.h" "$(@D)/cuda/include/crt/mma.h" && cp "/usr/local/cuda-10.0/include/crt/mma.hpp" "$(@D)/cuda/include/crt/mma.hpp" && cp "/usr/local/cuda-10.0/include/crt/nvfunctional" "$(@D)/cuda/include/crt/nvfunctional" && cp "/usr/local/cuda-10.0/include/crt/sm_70_rt.h" "$(@D)/cuda/include/crt/sm_70_rt.h" && cp "/usr/local/cuda-10.0/include/crt/sm_70_rt.hpp" "$(@D)/cuda/include/crt/sm_70_rt.hpp" && cp "/usr/local/cuda-10.0/include/crt/storage_class.h" "$(@D)/cuda/include/crt/storage_class.h" && cp "/usr/local/cuda-10.0/include/cuComplex.h" "$(@D)/cuda/include/cuComplex.h" && cp "/usr/local/cuda-10.0/include/cublas.h" "$(@D)/cuda/include/cublas.h" && cp "/usr/local/cuda-10.0/include/cublasXt.h" "$(@D)/cuda/include/cublasXt.h" && cp "/usr/local/cuda-10.0/include/cublas_api.h" "$(@D)/cuda/include/cublas_api.h" && cp "/usr/local/cuda-10.0/include/cublas_v2.h" "$(@D)/cuda/include/cublas_v2.h" && cp "/usr/local/cuda-10.0/include/cuda.h" "$(@D)/cuda/include/cuda.h" && cp "/usr/local/cuda-10.0/include/cudaEGL.h" "$(@D)/cuda/include/cudaEGL.h" && cp "/usr/local/cuda-10.0/include/cudaGL.h" "$(@D)/cuda/include/cudaGL.h" && cp "/usr/local/cuda-10.0/include/cudaProfiler.h" "$(@D)/cuda/include/cudaProfiler.h" && cp "/usr/local/cuda-10.0/include/cudaVDPAU.h" "$(@D)/cuda/include/cudaVDPAU.h" && cp "/usr/local/cuda-10.0/include/cuda_device_runtime_api.h" "$(@D)/cuda/include/cuda_device_runtime_api.h" && cp "/usr/local/cuda-10.0/include/cuda_fp16.h" "$(@D)/cuda/include/cuda_fp16.h" && cp "/usr/local/cuda-10.0/include/cuda_fp16.hpp" "$(@D)/cuda/include/cuda_fp16.hpp" && cp "/usr/local/cuda-10.0/include/cuda_gl_interop.h" "$(@D)/cuda/include/cuda_gl_interop.h" && cp "/usr/local/cuda-10.0/include/cuda_occupancy.h" "$(@D)/cuda/include/cuda_occupancy.h" && cp "/usr/local/cuda-10.0/include/cuda_profiler_api.h" "$(@D)/cuda/include/cuda_profiler_api.h" && cp "/usr/local/cuda-10.0/include/cuda_runtime.h" "$(@D)/cuda/include/cuda_runtime.h" && cp "/usr/local/cuda-10.0/include/cuda_runtime_api.h" "$(@D)/cuda/include/cuda_runtime_api.h" && cp "/usr/local/cuda-10.0/include/cuda_surface_types.h" "$(@D)/cuda/include/cuda_surface_types.h" && cp "/usr/local/cuda-10.0/include/cuda_texture_types.h" "$(@D)/cuda/include/cuda_texture_types.h" && cp "/usr/local/cuda-10.0/include/cuda_vdpau_interop.h" "$(@D)/cuda/include/cuda_vdpau_interop.h" && cp "/usr/local/cuda-10.0/include/cudalibxt.h" "$(@D)/cuda/include/cudalibxt.h" && cp "/usr/local/cuda-10.0/include/cudnn.h" "$(@D)/cuda/include/cudnn.h" && cp "/usr/local/cuda-10.0/include/cufft.h" "$(@D)/cuda/include/cufft.h" && cp "/usr/local/cuda-10.0/include/cufftXt.h" "$(@D)/cuda/include/cufftXt.h" && cp "/usr/local/cuda-10.0/include/cufftw.h" "$(@D)/cuda/include/cufftw.h" && cp "/usr/local/cuda-10.0/include/curand.h" "$(@D)/cuda/include/curand.h" && cp "/usr/local/cuda-10.0/include/curand_discrete.h" "$(@D)/cuda/include/curand_discrete.h" && cp "/usr/local/cuda-10.0/include/curand_discrete2.h" "$(@D)/cuda/include/curand_discrete2.h" && cp "/usr/local/cuda-10.0/include/curand_globals.h" "$(@D)/cuda/include/curand_globals.h" && cp "/usr/local/cuda-10.0/include/curand_kernel.h" "$(@D)/cuda/include/curand_kernel.h" && cp "/usr/local/cuda-10.0/include/curand_lognormal.h" "$(@D)/cuda/include/curand_lognormal.h" && cp "/usr/local/cuda-10.0/include/curand_mrg32k3a.h" "$(@D)/cuda/include/curand_mrg32k3a.h" && cp "/usr/local/cuda-10.0/include/curand_mtgp32.h" "$(@D)/cuda/include/curand_mtgp32.h" && cp "/usr/local/cuda-10.0/include/curand_mtgp32_host.h" "$(@D)/cuda/include/curand_mtgp32_host.h" && cp "/usr/local/cuda-10.0/include/curand_mtgp32_kernel.h" "$(@D)/cuda/include/curand_mtgp32_kernel.h" && cp "/usr/local/cuda-10.0/include/curand_mtgp32dc_p_11213.h" "$(@D)/cuda/include/curand_mtgp32dc_p_11213.h" && cp "/usr/local/cuda-10.0/include/curand_normal.h" "$(@D)/cuda/include/curand_normal.h" && cp "/usr/local/cuda-10.0/include/curand_normal_static.h" "$(@D)/cuda/include/curand_normal_static.h" && cp "/usr/local/cuda-10.0/include/curand_philox4x32_x.h" "$(@D)/cuda/include/curand_philox4x32_x.h" && cp "/usr/local/cuda-10.0/include/curand_poisson.h" "$(@D)/cuda/include/curand_poisson.h" && cp "/usr/local/cuda-10.0/include/curand_precalc.h" "$(@D)/cuda/include/curand_precalc.h" && cp "/usr/local/cuda-10.0/include/curand_uniform.h" "$(@D)/cuda/include/curand_uniform.h" && cp "/usr/local/cuda-10.0/include/cusolverDn.h" "$(@D)/cuda/include/cusolverDn.h" && cp "/usr/local/cuda-10.0/include/cusolverRf.h" "$(@D)/cuda/include/cusolverRf.h" && cp "/usr/local/cuda-10.0/include/cusolverSp.h" "$(@D)/cuda/include/cusolverSp.h" && cp "/usr/local/cuda-10.0/include/cusolverSp_LOWLEVEL_PREVIEW.h" "$(@D)/cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h" && cp "/usr/local/cuda-10.0/include/cusolver_common.h" "$(@D)/cuda/include/cusolver_common.h" && cp "/usr/local/cuda-10.0/include/cusparse.h" "$(@D)/cuda/include/cusparse.h" && cp "/usr/local/cuda-10.0/include/cusparse_v2.h" "$(@D)/cuda/include/cusparse_v2.h" && cp "/usr/local/cuda-10.0/include/device_atomic_functions.h" "$(@D)/cuda/include/device_atomic_functions.h" && cp "/usr/local/cuda-10.0/include/device_atomic_functions.hpp" "$(@D)/cuda/include/device_atomic_functions.hpp" && cp "/usr/local/cuda-10.0/include/device_double_functions.h" "$(@D)/cuda/include/device_double_functions.h" && cp "/usr/local/cuda-10.0/include/device_double_functions.hpp" "$(@D)/cuda/include/device_double_functions.hpp" && cp "/usr/local/cuda-10.0/include/device_functions.h" "$(@D)/cuda/include/device_functions.h" && cp "/usr/local/cuda-10.0/include/device_functions.hpp" "$(@D)/cuda/include/device_functions.hpp" && cp "/usr/local/cuda-10.0/include/device_functions_decls.h" "$(@D)/cuda/include/device_functions_decls.h" && cp "/usr/local/cuda-10.0/include/device_launch_parameters.h" "$(@D)/cuda/include/device_launch_parameters.h" && cp "/usr/local/cuda-10.0/include/device_types.h" "$(@D)/cuda/include/device_types.h" && cp "/usr/local/cuda-10.0/include/driver_functions.h" "$(@D)/cuda/include/driver_functions.h" && cp "/usr/local/cuda-10.0/include/driver_types.h" "$(@D)/cuda/include/driver_types.h" && cp "/usr/local/cuda-10.0/include/dynlink_cuda.h" "$(@D)/cuda/include/dynlink_cuda.h" && cp "/usr/local/cuda-10.0/include/dynlink_cuda_cuda.h" "$(@D)/cuda/include/dynlink_cuda_cuda.h" && cp "/usr/local/cuda-10.0/include/dynlink_cuviddec.h" "$(@D)/cuda/include/dynlink_cuviddec.h" && cp "/usr/local/cuda-10.0/include/dynlink_nvcuvid.h" "$(@D)/cuda/include/dynlink_nvcuvid.h" && cp "/usr/local/cuda-10.0/include/fatBinaryCtl.h" "$(@D)/cuda/include/fatBinaryCtl.h" && cp "/usr/local/cuda-10.0/include/fatbinary.h" "$(@D)/cuda/include/fatbinary.h" && cp "/usr/local/cuda-10.0/include/host_config.h" "$(@D)/cuda/include/host_config.h" && cp "/usr/local/cuda-10.0/include/host_defines.h" "$(@D)/cuda/include/host_defines.h" && cp "/usr/local/cuda-10.0/include/library_types.h" "$(@D)/cuda/include/library_types.h" && cp "/usr/local/cuda-10.0/include/math_constants.h" "$(@D)/cuda/include/math_constants.h" && cp "/usr/local/cuda-10.0/include/math_functions.h" "$(@D)/cuda/include/math_functions.h" && cp "/usr/local/cuda-10.0/include/math_functions.hpp" "$(@D)/cuda/include/math_functions.hpp" && cp "/usr/local/cuda-10.0/include/math_functions_dbl_ptx3.h" "$(@D)/cuda/include/math_functions_dbl_ptx3.h" && cp "/usr/local/cuda-10.0/include/math_functions_dbl_ptx3.hpp" "$(@D)/cuda/include/math_functions_dbl_ptx3.hpp" && cp "/usr/local/cuda-10.0/include/mma.h" "$(@D)/cuda/include/mma.h" && cp "/usr/local/cuda-10.0/include/npp.h" "$(@D)/cuda/include/npp.h" && cp "/usr/local/cuda-10.0/include/nppcore.h" "$(@D)/cuda/include/nppcore.h" && cp "/usr/local/cuda-10.0/include/nppdefs.h" "$(@D)/cuda/include/nppdefs.h" && cp "/usr/local/cuda-10.0/include/nppi.h" "$(@D)/cuda/include/nppi.h" && cp "/usr/local/cuda-10.0/include/nppi_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/nppi_arithmetic_and_logical_operations.h" && cp "/usr/local/cuda-10.0/include/nppi_color_conversion.h" "$(@D)/cuda/include/nppi_color_conversion.h" && cp "/usr/local/cuda-10.0/include/nppi_compression_functions.h" "$(@D)/cuda/include/nppi_compression_functions.h" && cp "/usr/local/cuda-10.0/include/nppi_computer_vision.h" "$(@D)/cuda/include/nppi_computer_vision.h" && cp "/usr/local/cuda-10.0/include/nppi_data_exchange_and_initialization.h" "$(@D)/cuda/include/nppi_data_exchange_and_initialization.h" && cp "/usr/local/cuda-10.0/include/nppi_filtering_functions.h" "$(@D)/cuda/include/nppi_filtering_functions.h" && cp "/usr/local/cuda-10.0/include/nppi_geometry_transforms.h" "$(@D)/cuda/include/nppi_geometry_transforms.h" && cp "/usr/local/cuda-10.0/include/nppi_linear_transforms.h" "$(@D)/cuda/include/nppi_linear_transforms.h" && cp "/usr/local/cuda-10.0/include/nppi_morphological_operations.h" "$(@D)/cuda/include/nppi_morphological_operations.h" && cp "/usr/local/cuda-10.0/include/nppi_statistics_functions.h" "$(@D)/cuda/include/nppi_statistics_functions.h" && cp "/usr/local/cuda-10.0/include/nppi_support_functions.h" "$(@D)/cuda/include/nppi_support_functions.h" && cp "/usr/local/cuda-10.0/include/nppi_threshold_and_compare_operations.h" "$(@D)/cuda/include/nppi_threshold_and_compare_operations.h" && cp "/usr/local/cuda-10.0/include/npps.h" "$(@D)/cuda/include/npps.h" && cp "/usr/local/cuda-10.0/include/npps_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/npps_arithmetic_and_logical_operations.h" && cp "/usr/local/cuda-10.0/include/npps_conversion_functions.h" "$(@D)/cuda/include/npps_conversion_functions.h" && cp "/usr/local/cuda-10.0/include/npps_filtering_functions.h" "$(@D)/cuda/include/npps_filtering_functions.h" && cp "/usr/local/cuda-10.0/include/npps_initialization.h" "$(@D)/cuda/include/npps_initialization.h" && cp "/usr/local/cuda-10.0/include/npps_statistics_functions.h" "$(@D)/cuda/include/npps_statistics_functions.h" && cp "/usr/local/cuda-10.0/include/npps_support_functions.h" "$(@D)/cuda/include/npps_support_functions.h" && cp "/usr/local/cuda-10.0/include/nppversion.h" "$(@D)/cuda/include/nppversion.h" && cp "/usr/local/cuda-10.0/include/nvToolsExt.h" "$(@D)/cuda/include/nvToolsExt.h" && cp "/usr/local/cuda-10.0/include/nvToolsExtCuda.h" "$(@D)/cuda/include/nvToolsExtCuda.h" && cp "/usr/local/cuda-10.0/include/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvToolsExtCudaRt.h" && cp "/usr/local/cuda-10.0/include/nvToolsExtMeta.h" "$(@D)/cuda/include/nvToolsExtMeta.h" && cp "/usr/local/cuda-10.0/include/nvToolsExtSync.h" "$(@D)/cuda/include/nvToolsExtSync.h" && cp "/usr/local/cuda-10.0/include/nvblas.h" "$(@D)/cuda/include/nvblas.h" && cp "/usr/local/cuda-10.0/include/nvfunctional" "$(@D)/cuda/include/nvfunctional" && cp "/usr/local/cuda-10.0/include/nvgraph.h" "$(@D)/cuda/include/nvgraph.h" && cp "/usr/local/cuda-10.0/include/nvml.h" "$(@D)/cuda/include/nvml.h" && cp "/usr/local/cuda-10.0/include/nvrtc.h" "$(@D)/cuda/include/nvrtc.h" && cp "/usr/local/cuda-10.0/include/sm_20_atomic_functions.h" "$(@D)/cuda/include/sm_20_atomic_functions.h" && cp "/usr/local/cuda-10.0/include/sm_20_atomic_functions.hpp" "$(@D)/cuda/include/sm_20_atomic_functions.hpp" && cp "/usr/local/cuda-10.0/include/sm_20_intrinsics.h" "$(@D)/cuda/include/sm_20_intrinsics.h" && cp "/usr/local/cuda-10.0/include/sm_20_intrinsics.hpp" "$(@D)/cuda/include/sm_20_intrinsics.hpp" && cp "/usr/local/cuda-10.0/include/sm_30_intrinsics.h" "$(@D)/cuda/include/sm_30_intrinsics.h" && cp "/usr/local/cuda-10.0/include/sm_30_intrinsics.hpp" "$(@D)/cuda/include/sm_30_intrinsics.hpp" && cp "/usr/local/cuda-10.0/include/sm_32_atomic_functions.h" "$(@D)/cuda/include/sm_32_atomic_functions.h" && cp "/usr/local/cuda-10.0/include/sm_32_atomic_functions.hpp" "$(@D)/cuda/include/sm_32_atomic_functions.hpp" && cp "/usr/local/cuda-10.0/include/sm_32_intrinsics.h" "$(@D)/cuda/include/sm_32_intrinsics.h" && cp "/usr/local/cuda-10.0/include/sm_32_intrinsics.hpp" "$(@D)/cuda/include/sm_32_intrinsics.hpp" && cp "/usr/local/cuda-10.0/include/sm_35_atomic_functions.h" "$(@D)/cuda/include/sm_35_atomic_functions.h" && cp "/usr/local/cuda-10.0/include/sm_35_intrinsics.h" "$(@D)/cuda/include/sm_35_intrinsics.h" && cp "/usr/local/cuda-10.0/include/sm_60_atomic_functions.h" "$(@D)/cuda/include/sm_60_atomic_functions.h" && cp "/usr/local/cuda-10.0/include/sm_60_atomic_functions.hpp" "$(@D)/cuda/include/sm_60_atomic_functions.hpp" && cp "/usr/local/cuda-10.0/include/sm_61_intrinsics.h" "$(@D)/cuda/include/sm_61_intrinsics.h" && cp "/usr/local/cuda-10.0/include/sm_61_intrinsics.hpp" "$(@D)/cuda/include/sm_61_intrinsics.hpp" && cp "/usr/local/cuda-10.0/include/sobol_direction_vectors.h" "$(@D)/cuda/include/sobol_direction_vectors.h" && cp "/usr/local/cuda-10.0/include/surface_functions.h" "$(@D)/cuda/include/surface_functions.h" && cp "/usr/local/cuda-10.0/include/surface_functions.hpp" "$(@D)/cuda/include/surface_functions.hpp" && cp "/usr/local/cuda-10.0/include/surface_indirect_functions.h" "$(@D)/cuda/include/surface_indirect_functions.h" && cp "/usr/local/cuda-10.0/include/surface_indirect_functions.hpp" "$(@D)/cuda/include/surface_indirect_functions.hpp" && cp "/usr/local/cuda-10.0/include/surface_types.h" "$(@D)/cuda/include/surface_types.h" && cp "/usr/local/cuda-10.0/include/texture_fetch_functions.h" "$(@D)/cuda/include/texture_fetch_functions.h" && cp "/usr/local/cuda-10.0/include/texture_fetch_functions.hpp" "$(@D)/cuda/include/texture_fetch_functions.hpp" && cp "/usr/local/cuda-10.0/include/texture_indirect_functions.h" "$(@D)/cuda/include/texture_indirect_functions.h" && cp "/usr/local/cuda-10.0/include/texture_indirect_functions.hpp" "$(@D)/cuda/include/texture_indirect_functions.hpp" && cp "/usr/local/cuda-10.0/include/texture_types.h" "$(@D)/cuda/include/texture_types.h" && cp "/usr/local/cuda-10.0/include/thrust/adjacent_difference.h" "$(@D)/cuda/include/thrust/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/advance.h" "$(@D)/cuda/include/thrust/advance.h" && cp "/usr/local/cuda-10.0/include/thrust/binary_search.h" "$(@D)/cuda/include/thrust/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/complex.h" "$(@D)/cuda/include/thrust/complex.h" && cp "/usr/local/cuda-10.0/include/thrust/copy.h" "$(@D)/cuda/include/thrust/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/count.h" "$(@D)/cuda/include/thrust/count.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/adjacent_difference.inl" "$(@D)/cuda/include/thrust/detail/adjacent_difference.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/advance.inl" "$(@D)/cuda/include/thrust/detail/advance.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/allocator_traits.h" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/allocator_traits.inl" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/copy_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/copy_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/default_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/default_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/destroy_range.h" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/destroy_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/fill_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/fill_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/malloc_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/malloc_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/no_throw_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/no_throw_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/tagged_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/tagged_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/temporary_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/temporary_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/binary_search.inl" "$(@D)/cuda/include/thrust/detail/binary_search.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/arithmetic.h" "$(@D)/cuda/include/thrust/detail/complex/arithmetic.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/c99math.h" "$(@D)/cuda/include/thrust/detail/complex/c99math.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/catrig.h" "$(@D)/cuda/include/thrust/detail/complex/catrig.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/catrigf.h" "$(@D)/cuda/include/thrust/detail/complex/catrigf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/ccosh.h" "$(@D)/cuda/include/thrust/detail/complex/ccosh.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/ccoshf.h" "$(@D)/cuda/include/thrust/detail/complex/ccoshf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/cexp.h" "$(@D)/cuda/include/thrust/detail/complex/cexp.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/cexpf.h" "$(@D)/cuda/include/thrust/detail/complex/cexpf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/clog.h" "$(@D)/cuda/include/thrust/detail/complex/clog.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/clogf.h" "$(@D)/cuda/include/thrust/detail/complex/clogf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/complex.inl" "$(@D)/cuda/include/thrust/detail/complex/complex.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/cpow.h" "$(@D)/cuda/include/thrust/detail/complex/cpow.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/cpowf.h" "$(@D)/cuda/include/thrust/detail/complex/cpowf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/cproj.h" "$(@D)/cuda/include/thrust/detail/complex/cproj.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/csinh.h" "$(@D)/cuda/include/thrust/detail/complex/csinh.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/csinhf.h" "$(@D)/cuda/include/thrust/detail/complex/csinhf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/csqrt.h" "$(@D)/cuda/include/thrust/detail/complex/csqrt.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/csqrtf.h" "$(@D)/cuda/include/thrust/detail/complex/csqrtf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/ctanh.h" "$(@D)/cuda/include/thrust/detail/complex/ctanh.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/ctanhf.h" "$(@D)/cuda/include/thrust/detail/complex/ctanhf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/math_private.h" "$(@D)/cuda/include/thrust/detail/complex/math_private.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/stream.h" "$(@D)/cuda/include/thrust/detail/complex/stream.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config.h" "$(@D)/cuda/include/thrust/detail/config.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/compiler.h" "$(@D)/cuda/include/thrust/detail/config/compiler.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/compiler_fence.h" "$(@D)/cuda/include/thrust/detail/config/compiler_fence.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/config.h" "$(@D)/cuda/include/thrust/detail/config/config.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/debug.h" "$(@D)/cuda/include/thrust/detail/config/debug.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/device_system.h" "$(@D)/cuda/include/thrust/detail/config/device_system.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/exec_check_disable.h" "$(@D)/cuda/include/thrust/detail/config/exec_check_disable.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/forceinline.h" "$(@D)/cuda/include/thrust/detail/config/forceinline.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/global_workarounds.h" "$(@D)/cuda/include/thrust/detail/config/global_workarounds.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/host_device.h" "$(@D)/cuda/include/thrust/detail/config/host_device.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/host_system.h" "$(@D)/cuda/include/thrust/detail/config/host_system.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/simple_defines.h" "$(@D)/cuda/include/thrust/detail/config/simple_defines.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/contiguous_storage.h" "$(@D)/cuda/include/thrust/detail/contiguous_storage.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/contiguous_storage.inl" "$(@D)/cuda/include/thrust/detail/contiguous_storage.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/copy.h" "$(@D)/cuda/include/thrust/detail/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/copy.inl" "$(@D)/cuda/include/thrust/detail/copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/copy_if.h" "$(@D)/cuda/include/thrust/detail/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/copy_if.inl" "$(@D)/cuda/include/thrust/detail/copy_if.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/count.inl" "$(@D)/cuda/include/thrust/detail/count.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/cstdint.h" "$(@D)/cuda/include/thrust/detail/cstdint.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_delete.inl" "$(@D)/cuda/include/thrust/detail/device_delete.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_free.inl" "$(@D)/cuda/include/thrust/detail/device_free.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_malloc.inl" "$(@D)/cuda/include/thrust/detail/device_malloc.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_new.inl" "$(@D)/cuda/include/thrust/detail/device_new.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_ptr.inl" "$(@D)/cuda/include/thrust/detail/device_ptr.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_reference.inl" "$(@D)/cuda/include/thrust/detail/device_reference.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_vector.inl" "$(@D)/cuda/include/thrust/detail/device_vector.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/dispatch/is_trivial_copy.h" "$(@D)/cuda/include/thrust/detail/dispatch/is_trivial_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/distance.inl" "$(@D)/cuda/include/thrust/detail/distance.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/equal.inl" "$(@D)/cuda/include/thrust/detail/equal.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/execute_with_allocator.h" "$(@D)/cuda/include/thrust/detail/execute_with_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/execution_policy.h" "$(@D)/cuda/include/thrust/detail/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/extrema.inl" "$(@D)/cuda/include/thrust/detail/extrema.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/fill.inl" "$(@D)/cuda/include/thrust/detail/fill.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/find.inl" "$(@D)/cuda/include/thrust/detail/find.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/for_each.inl" "$(@D)/cuda/include/thrust/detail/for_each.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/function.h" "$(@D)/cuda/include/thrust/detail/function.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional.inl" "$(@D)/cuda/include/thrust/detail/functional.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/actor.h" "$(@D)/cuda/include/thrust/detail/functional/actor.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/actor.inl" "$(@D)/cuda/include/thrust/detail/functional/actor.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/argument.h" "$(@D)/cuda/include/thrust/detail/functional/argument.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/composite.h" "$(@D)/cuda/include/thrust/detail/functional/composite.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/arithmetic_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/arithmetic_operators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/assignment_operator.h" "$(@D)/cuda/include/thrust/detail/functional/operators/assignment_operator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/bitwise_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/bitwise_operators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/compound_assignment_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/logical_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/logical_operators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/operator_adaptors.h" "$(@D)/cuda/include/thrust/detail/functional/operators/operator_adaptors.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/relational_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/relational_operators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/placeholder.h" "$(@D)/cuda/include/thrust/detail/functional/placeholder.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/value.h" "$(@D)/cuda/include/thrust/detail/functional/value.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/gather.inl" "$(@D)/cuda/include/thrust/detail/gather.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/generate.inl" "$(@D)/cuda/include/thrust/detail/generate.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/get_iterator_value.h" "$(@D)/cuda/include/thrust/detail/get_iterator_value.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/host_vector.inl" "$(@D)/cuda/include/thrust/detail/host_vector.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/inner_product.inl" "$(@D)/cuda/include/thrust/detail/inner_product.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/integer_math.h" "$(@D)/cuda/include/thrust/detail/integer_math.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/integer_traits.h" "$(@D)/cuda/include/thrust/detail/integer_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/internal_functional.h" "$(@D)/cuda/include/thrust/detail/internal_functional.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/logical.inl" "$(@D)/cuda/include/thrust/detail/logical.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/detail/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/merge.inl" "$(@D)/cuda/include/thrust/detail/merge.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/minmax.h" "$(@D)/cuda/include/thrust/detail/minmax.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/mismatch.inl" "$(@D)/cuda/include/thrust/detail/mismatch.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/mpl/math.h" "$(@D)/cuda/include/thrust/detail/mpl/math.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/numeric_traits.h" "$(@D)/cuda/include/thrust/detail/numeric_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/overlapped_copy.h" "$(@D)/cuda/include/thrust/detail/overlapped_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/pair.inl" "$(@D)/cuda/include/thrust/detail/pair.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/partition.inl" "$(@D)/cuda/include/thrust/detail/partition.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/pointer.h" "$(@D)/cuda/include/thrust/detail/pointer.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/pointer.inl" "$(@D)/cuda/include/thrust/detail/pointer.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/range/head_flags.h" "$(@D)/cuda/include/thrust/detail/range/head_flags.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/range/tail_flags.h" "$(@D)/cuda/include/thrust/detail/range/tail_flags.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/raw_pointer_cast.h" "$(@D)/cuda/include/thrust/detail/raw_pointer_cast.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/raw_reference_cast.h" "$(@D)/cuda/include/thrust/detail/raw_reference_cast.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/reduce.inl" "$(@D)/cuda/include/thrust/detail/reduce.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/reference.h" "$(@D)/cuda/include/thrust/detail/reference.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/reference.inl" "$(@D)/cuda/include/thrust/detail/reference.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/reference_forward_declaration.h" "$(@D)/cuda/include/thrust/detail/reference_forward_declaration.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/remove.inl" "$(@D)/cuda/include/thrust/detail/remove.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/replace.inl" "$(@D)/cuda/include/thrust/detail/replace.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/reverse.inl" "$(@D)/cuda/include/thrust/detail/reverse.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/scan.inl" "$(@D)/cuda/include/thrust/detail/scan.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/scatter.inl" "$(@D)/cuda/include/thrust/detail/scatter.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/seq.h" "$(@D)/cuda/include/thrust/detail/seq.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/sequence.inl" "$(@D)/cuda/include/thrust/detail/sequence.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/set_operations.inl" "$(@D)/cuda/include/thrust/detail/set_operations.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/sort.inl" "$(@D)/cuda/include/thrust/detail/sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/static_assert.h" "$(@D)/cuda/include/thrust/detail/static_assert.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/static_map.h" "$(@D)/cuda/include/thrust/detail/static_map.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/swap.h" "$(@D)/cuda/include/thrust/detail/swap.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/swap.inl" "$(@D)/cuda/include/thrust/detail/swap.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/swap_ranges.inl" "$(@D)/cuda/include/thrust/detail/swap_ranges.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/tabulate.inl" "$(@D)/cuda/include/thrust/detail/tabulate.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/temporary_array.h" "$(@D)/cuda/include/thrust/detail/temporary_array.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/temporary_array.inl" "$(@D)/cuda/include/thrust/detail/temporary_array.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/detail/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/transform.inl" "$(@D)/cuda/include/thrust/detail/transform.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/transform_reduce.inl" "$(@D)/cuda/include/thrust/detail/transform_reduce.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/transform_scan.inl" "$(@D)/cuda/include/thrust/detail/transform_scan.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/trivial_sequence.h" "$(@D)/cuda/include/thrust/detail/trivial_sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/tuple.inl" "$(@D)/cuda/include/thrust/detail/tuple.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/tuple_meta_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_meta_transform.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/tuple_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_transform.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" "$(@D)/cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/function_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/function_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_member_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_member_function.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_nested_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_nested_type.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_trivial_assign.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_trivial_assign.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/is_call_possible.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_call_possible.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/is_metafunction_defined.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_metafunction_defined.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/iterator/is_output_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/minimum_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/minimum_type.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/pointer_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/pointer_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/result_of_adaptable_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_fill.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/unique.inl" "$(@D)/cuda/include/thrust/detail/unique.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/use_default.h" "$(@D)/cuda/include/thrust/detail/use_default.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/util/align.h" "$(@D)/cuda/include/thrust/detail/util/align.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/util/blocking.h" "$(@D)/cuda/include/thrust/detail/util/blocking.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/vector_base.h" "$(@D)/cuda/include/thrust/detail/vector_base.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/vector_base.inl" "$(@D)/cuda/include/thrust/detail/vector_base.inl" && cp "/usr/local/cuda-10.0/include/thrust/device_allocator.h" "$(@D)/cuda/include/thrust/device_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/device_delete.h" "$(@D)/cuda/include/thrust/device_delete.h" && cp "/usr/local/cuda-10.0/include/thrust/device_free.h" "$(@D)/cuda/include/thrust/device_free.h" && cp "/usr/local/cuda-10.0/include/thrust/device_malloc.h" "$(@D)/cuda/include/thrust/device_malloc.h" && cp "/usr/local/cuda-10.0/include/thrust/device_malloc_allocator.h" "$(@D)/cuda/include/thrust/device_malloc_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/device_new.h" "$(@D)/cuda/include/thrust/device_new.h" && cp "/usr/local/cuda-10.0/include/thrust/device_new_allocator.h" "$(@D)/cuda/include/thrust/device_new_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/device_ptr.h" "$(@D)/cuda/include/thrust/device_ptr.h" && cp "/usr/local/cuda-10.0/include/thrust/device_reference.h" "$(@D)/cuda/include/thrust/device_reference.h" && cp "/usr/local/cuda-10.0/include/thrust/device_vector.h" "$(@D)/cuda/include/thrust/device_vector.h" && cp "/usr/local/cuda-10.0/include/thrust/distance.h" "$(@D)/cuda/include/thrust/distance.h" && cp "/usr/local/cuda-10.0/include/thrust/equal.h" "$(@D)/cuda/include/thrust/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/execution_policy.h" "$(@D)/cuda/include/thrust/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/extrema.h" "$(@D)/cuda/include/thrust/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/fill.h" "$(@D)/cuda/include/thrust/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/find.h" "$(@D)/cuda/include/thrust/find.h" && cp "/usr/local/cuda-10.0/include/thrust/for_each.h" "$(@D)/cuda/include/thrust/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/functional.h" "$(@D)/cuda/include/thrust/functional.h" && cp "/usr/local/cuda-10.0/include/thrust/gather.h" "$(@D)/cuda/include/thrust/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/generate.h" "$(@D)/cuda/include/thrust/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/host_vector.h" "$(@D)/cuda/include/thrust/host_vector.h" && cp "/usr/local/cuda-10.0/include/thrust/inner_product.h" "$(@D)/cuda/include/thrust/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/constant_iterator.h" "$(@D)/cuda/include/thrust/iterator/constant_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/counting_iterator.h" "$(@D)/cuda/include/thrust/iterator/counting_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/any_assign.h" "$(@D)/cuda/include/thrust/iterator/detail/any_assign.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/any_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/any_system_tag.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/constant_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/constant_iterator_base.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/counting_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/counting_iterator.inl" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/device_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/device_system_tag.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/discard_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/discard_iterator_base.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/distance_from_result.h" "$(@D)/cuda/include/thrust/iterator/detail/distance_from_result.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/host_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/host_system_tag.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/is_iterator_category.h" "$(@D)/cuda/include/thrust/iterator/detail/is_iterator_category.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/is_trivial_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/is_trivial_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_adaptor_base.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_adaptor_base.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_to_system.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_system.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_to_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_facade_category.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_facade_category.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_traits.inl" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traits.inl" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_traversal_tags.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traversal_tags.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/join_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/join_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/minimum_category.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_category.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/minimum_system.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_system.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/normal_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/normal_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/permutation_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/permutation_iterator_base.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/retag.h" "$(@D)/cuda/include/thrust/iterator/detail/retag.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/reverse_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator.inl" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/reverse_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator_base.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/tagged_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/tagged_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/transform_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_iterator.inl" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/transform_output_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_output_iterator.inl" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/tuple_of_iterator_references.h" "$(@D)/cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/universal_categories.h" "$(@D)/cuda/include/thrust/iterator/detail/universal_categories.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/zip_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator.inl" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/zip_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator_base.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/discard_iterator.h" "$(@D)/cuda/include/thrust/iterator/discard_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/iterator_adaptor.h" "$(@D)/cuda/include/thrust/iterator/iterator_adaptor.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/iterator_categories.h" "$(@D)/cuda/include/thrust/iterator/iterator_categories.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/iterator_facade.h" "$(@D)/cuda/include/thrust/iterator/iterator_facade.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/iterator_traits.h" "$(@D)/cuda/include/thrust/iterator/iterator_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/permutation_iterator.h" "$(@D)/cuda/include/thrust/iterator/permutation_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/retag.h" "$(@D)/cuda/include/thrust/iterator/retag.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/reverse_iterator.h" "$(@D)/cuda/include/thrust/iterator/reverse_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/transform_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/transform_output_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_output_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/zip_iterator.h" "$(@D)/cuda/include/thrust/iterator/zip_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/logical.h" "$(@D)/cuda/include/thrust/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/memory.h" "$(@D)/cuda/include/thrust/memory.h" && cp "/usr/local/cuda-10.0/include/thrust/merge.h" "$(@D)/cuda/include/thrust/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/mismatch.h" "$(@D)/cuda/include/thrust/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/pair.h" "$(@D)/cuda/include/thrust/pair.h" && cp "/usr/local/cuda-10.0/include/thrust/partition.h" "$(@D)/cuda/include/thrust/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/random.h" "$(@D)/cuda/include/thrust/random.h" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/discard_block_engine.inl" "$(@D)/cuda/include/thrust/random/detail/discard_block_engine.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/linear_congruential_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/linear_congruential_engine_discard.h" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine_discard.h" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/linear_feedback_shift_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/mod.h" "$(@D)/cuda/include/thrust/random/detail/mod.h" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/normal_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/normal_distribution.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/normal_distribution_base.h" "$(@D)/cuda/include/thrust/random/detail/normal_distribution_base.h" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/random_core_access.h" "$(@D)/cuda/include/thrust/random/detail/random_core_access.h" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/subtract_with_carry_engine.inl" "$(@D)/cuda/include/thrust/random/detail/subtract_with_carry_engine.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/uniform_int_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_int_distribution.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/uniform_real_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_real_distribution.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/xor_combine_engine.inl" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/xor_combine_engine_max.h" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine_max.h" && cp "/usr/local/cuda-10.0/include/thrust/random/discard_block_engine.h" "$(@D)/cuda/include/thrust/random/discard_block_engine.h" && cp "/usr/local/cuda-10.0/include/thrust/random/linear_congruential_engine.h" "$(@D)/cuda/include/thrust/random/linear_congruential_engine.h" && cp "/usr/local/cuda-10.0/include/thrust/random/linear_feedback_shift_engine.h" "$(@D)/cuda/include/thrust/random/linear_feedback_shift_engine.h" && cp "/usr/local/cuda-10.0/include/thrust/random/normal_distribution.h" "$(@D)/cuda/include/thrust/random/normal_distribution.h" && cp "/usr/local/cuda-10.0/include/thrust/random/subtract_with_carry_engine.h" "$(@D)/cuda/include/thrust/random/subtract_with_carry_engine.h" && cp "/usr/local/cuda-10.0/include/thrust/random/uniform_int_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_int_distribution.h" && cp "/usr/local/cuda-10.0/include/thrust/random/uniform_real_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_real_distribution.h" && cp "/usr/local/cuda-10.0/include/thrust/random/xor_combine_engine.h" "$(@D)/cuda/include/thrust/random/xor_combine_engine.h" && cp "/usr/local/cuda-10.0/include/thrust/reduce.h" "$(@D)/cuda/include/thrust/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/remove.h" "$(@D)/cuda/include/thrust/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/replace.h" "$(@D)/cuda/include/thrust/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/reverse.h" "$(@D)/cuda/include/thrust/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/scan.h" "$(@D)/cuda/include/thrust/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/scatter.h" "$(@D)/cuda/include/thrust/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/sequence.h" "$(@D)/cuda/include/thrust/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/set_operations.h" "$(@D)/cuda/include/thrust/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/sort.h" "$(@D)/cuda/include/thrust/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/swap.h" "$(@D)/cuda/include/thrust/swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cpp/detail/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/assign_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cpp/detail/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/count.h" "$(@D)/cuda/include/thrust/system/cpp/detail/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/equal.h" "$(@D)/cuda/include/thrust/system/cpp/detail/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cpp/detail/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/find.h" "$(@D)/cuda/include/thrust/system/cpp/detail/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cpp/detail/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/gather.h" "$(@D)/cuda/include/thrust/system/cpp/detail/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/generate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/get_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cpp/detail/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cpp/detail/iter_swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/logical.h" "$(@D)/cuda/include/thrust/system/cpp/detail/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cpp/detail/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/memory.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/merge.h" "$(@D)/cuda/include/thrust/system/cpp/detail/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cpp/detail/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/par.h" "$(@D)/cuda/include/thrust/system/cpp/detail/par.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/partition.h" "$(@D)/cuda/include/thrust/system/cpp/detail/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/remove.h" "$(@D)/cuda/include/thrust/system/cpp/detail/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/replace.h" "$(@D)/cuda/include/thrust/system/cpp/detail/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cpp/detail/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/sort.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cpp/detail/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cpp/detail/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/unique.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/vector.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/memory.h" "$(@D)/cuda/include/thrust/system/cpp/memory.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/vector.h" "$(@D)/cuda/include/thrust/system/cpp/vector.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/config.h" "$(@D)/cuda/include/thrust/system/cuda/config.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cuda/detail/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/assign_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cuda/detail/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/agent_launcher.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/agent_launcher.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/alignment.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/alignment.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/util.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/count.h" "$(@D)/cuda/include/thrust/system/cuda/detail/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/cross_system.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/cub.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/cub.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_select.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/host/mutex.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_allocator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_arch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_debug.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_device.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_device.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_macro.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_namespace.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_ptx.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_type.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_type.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/equal.h" "$(@D)/cuda/include/thrust/system/cuda/detail/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/error.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/error.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cuda/detail/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/find.h" "$(@D)/cuda/include/thrust/system/cuda/detail/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cuda/detail/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/gather.h" "$(@D)/cuda/include/thrust/system/cuda/detail/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/generate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/get_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/guarded_driver_types.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_driver_types.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cuda/detail/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/internal/copy_cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cuda/detail/iter_swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/logical.h" "$(@D)/cuda/include/thrust/system/cuda/detail/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cuda/detail/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/memory.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/memory_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/memory_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/par.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/par_to_seq.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par_to_seq.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/parallel_for.h" "$(@D)/cuda/include/thrust/system/cuda/detail/parallel_for.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/partition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/remove.h" "$(@D)/cuda/include/thrust/system/cuda/detail/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/replace.h" "$(@D)/cuda/include/thrust/system/cuda/detail/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cuda/detail/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cuda/detail/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/terminate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/terminate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/unique.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/util.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/vector.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/error.h" "$(@D)/cuda/include/thrust/system/cuda/error.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/experimental/pinned_allocator.h" "$(@D)/cuda/include/thrust/system/cuda/experimental/pinned_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/memory.h" "$(@D)/cuda/include/thrust/system/cuda/memory.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/vector.h" "$(@D)/cuda/include/thrust/system/cuda/vector.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/adl/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/assign_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/adl/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/count.h" "$(@D)/cuda/include/thrust/system/detail/adl/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/equal.h" "$(@D)/cuda/include/thrust/system/detail/adl/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/extrema.h" "$(@D)/cuda/include/thrust/system/detail/adl/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/find.h" "$(@D)/cuda/include/thrust/system/detail/adl/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/for_each.h" "$(@D)/cuda/include/thrust/system/detail/adl/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/gather.h" "$(@D)/cuda/include/thrust/system/detail/adl/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/generate.h" "$(@D)/cuda/include/thrust/system/detail/adl/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/get_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/get_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/adl/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/adl/iter_swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/logical.h" "$(@D)/cuda/include/thrust/system/detail/adl/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/adl/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/merge.h" "$(@D)/cuda/include/thrust/system/detail/adl/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/adl/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/partition.h" "$(@D)/cuda/include/thrust/system/detail/adl/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/remove.h" "$(@D)/cuda/include/thrust/system/detail/adl/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/replace.h" "$(@D)/cuda/include/thrust/system/detail/adl/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reverse.h" "$(@D)/cuda/include/thrust/system/detail/adl/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scatter.h" "$(@D)/cuda/include/thrust/system/detail/adl/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/sequence.h" "$(@D)/cuda/include/thrust/system/detail/adl/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/adl/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/sort.h" "$(@D)/cuda/include/thrust/system/detail/adl/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/adl/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/adl/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/adl/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/unique.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/bad_alloc.h" "$(@D)/cuda/include/thrust/system/detail/bad_alloc.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/errno.h" "$(@D)/cuda/include/thrust/system/detail/errno.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/error_category.inl" "$(@D)/cuda/include/thrust/system/detail/error_category.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/error_code.inl" "$(@D)/cuda/include/thrust/system/detail/error_code.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/error_condition.inl" "$(@D)/cuda/include/thrust/system/detail/error_condition.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/adjacent_difference.inl" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/advance.h" "$(@D)/cuda/include/thrust/system/detail/generic/advance.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/advance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/advance.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy_if.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/count.h" "$(@D)/cuda/include/thrust/system/detail/generic/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/count.inl" "$(@D)/cuda/include/thrust/system/detail/generic/count.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/distance.h" "$(@D)/cuda/include/thrust/system/detail/generic/distance.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/distance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/distance.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/equal.h" "$(@D)/cuda/include/thrust/system/detail/generic/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/equal.inl" "$(@D)/cuda/include/thrust/system/detail/generic/equal.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/extrema.h" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/extrema.inl" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/find.h" "$(@D)/cuda/include/thrust/system/detail/generic/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/find.inl" "$(@D)/cuda/include/thrust/system/detail/generic/find.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/for_each.h" "$(@D)/cuda/include/thrust/system/detail/generic/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/gather.h" "$(@D)/cuda/include/thrust/system/detail/generic/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/gather.inl" "$(@D)/cuda/include/thrust/system/detail/generic/gather.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/generate.h" "$(@D)/cuda/include/thrust/system/detail/generic/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/generate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/generate.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/inner_product.inl" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/logical.h" "$(@D)/cuda/include/thrust/system/detail/generic/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/memory.h" "$(@D)/cuda/include/thrust/system/detail/generic/memory.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/memory.inl" "$(@D)/cuda/include/thrust/system/detail/generic/memory.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/merge.h" "$(@D)/cuda/include/thrust/system/detail/generic/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/merge.inl" "$(@D)/cuda/include/thrust/system/detail/generic/merge.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/mismatch.inl" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/partition.h" "$(@D)/cuda/include/thrust/system/detail/generic/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/partition.inl" "$(@D)/cuda/include/thrust/system/detail/generic/partition.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/remove.h" "$(@D)/cuda/include/thrust/system/detail/generic/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/remove.inl" "$(@D)/cuda/include/thrust/system/detail/generic/remove.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/replace.h" "$(@D)/cuda/include/thrust/system/detail/generic/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/replace.inl" "$(@D)/cuda/include/thrust/system/detail/generic/replace.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reverse.h" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reverse.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scalar/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scalar/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scatter.h" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scatter.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/select_system.h" "$(@D)/cuda/include/thrust/system/detail/generic/select_system.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sequence.h" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sequence.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/set_operations.inl" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sort.h" "$(@D)/cuda/include/thrust/system/detail/generic/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sort.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/swap_ranges.inl" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tabulate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tag.h" "$(@D)/cuda/include/thrust/system/detail/generic/tag.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/temporary_buffer.inl" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/type_traits.h" "$(@D)/cuda/include/thrust/system/detail/generic/type_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/internal/decompose.h" "$(@D)/cuda/include/thrust/system/detail/internal/decompose.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/sequential/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/assign_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/sequential/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy_backward.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_backward.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/count.h" "$(@D)/cuda/include/thrust/system/detail/sequential/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/equal.h" "$(@D)/cuda/include/thrust/system/detail/sequential/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/execution_policy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/extrema.h" "$(@D)/cuda/include/thrust/system/detail/sequential/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/find.h" "$(@D)/cuda/include/thrust/system/detail/sequential/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/for_each.h" "$(@D)/cuda/include/thrust/system/detail/sequential/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/gather.h" "$(@D)/cuda/include/thrust/system/detail/sequential/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/general_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/general_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/generate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/get_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/get_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/sequential/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/insertion_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/insertion_sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/sequential/iter_swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/logical.h" "$(@D)/cuda/include/thrust/system/detail/sequential/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/sequential/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/merge.h" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/merge.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/sequential/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/partition.h" "$(@D)/cuda/include/thrust/system/detail/sequential/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/remove.h" "$(@D)/cuda/include/thrust/system/detail/sequential/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/replace.h" "$(@D)/cuda/include/thrust/system/detail/sequential/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reverse.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scatter.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sequence.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/sequential/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_merge_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_merge_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_primitive_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_primitive_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_radix_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_radix_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/sequential/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/sequential/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/trivial_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/trivial_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/unique.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/system_error.inl" "$(@D)/cuda/include/thrust/system/detail/system_error.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/error_code.h" "$(@D)/cuda/include/thrust/system/error_code.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/omp/detail/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/assign_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/omp/detail/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/count.h" "$(@D)/cuda/include/thrust/system/omp/detail/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/default_decomposition.h" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/default_decomposition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/equal.h" "$(@D)/cuda/include/thrust/system/omp/detail/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/detail/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/omp/detail/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/find.h" "$(@D)/cuda/include/thrust/system/omp/detail/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/gather.h" "$(@D)/cuda/include/thrust/system/omp/detail/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/generate.h" "$(@D)/cuda/include/thrust/system/omp/detail/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/get_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/omp/detail/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/omp/detail/iter_swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/logical.h" "$(@D)/cuda/include/thrust/system/omp/detail/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/omp/detail/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/omp/detail/memory.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/merge.h" "$(@D)/cuda/include/thrust/system/omp/detail/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/omp/detail/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/par.h" "$(@D)/cuda/include/thrust/system/omp/detail/par.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/partition.h" "$(@D)/cuda/include/thrust/system/omp/detail/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/partition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/partition.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_intervals.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/remove.h" "$(@D)/cuda/include/thrust/system/omp/detail/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/remove.inl" "$(@D)/cuda/include/thrust/system/omp/detail/remove.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/replace.h" "$(@D)/cuda/include/thrust/system/omp/detail/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/omp/detail/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/omp/detail/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/omp/detail/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/omp/detail/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sort.h" "$(@D)/cuda/include/thrust/system/omp/detail/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sort.inl" "$(@D)/cuda/include/thrust/system/omp/detail/sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/omp/detail/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/omp/detail/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/omp/detail/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/omp/detail/vector.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/memory.h" "$(@D)/cuda/include/thrust/system/omp/memory.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/vector.h" "$(@D)/cuda/include/thrust/system/omp/vector.h" && cp "/usr/local/cuda-10.0/include/thrust/system/system_error.h" "$(@D)/cuda/include/thrust/system/system_error.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/tbb/detail/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/assign_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/tbb/detail/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/count.h" "$(@D)/cuda/include/thrust/system/tbb/detail/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/equal.h" "$(@D)/cuda/include/thrust/system/tbb/detail/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/extrema.h" "$(@D)/cuda/include/thrust/system/tbb/detail/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/find.h" "$(@D)/cuda/include/thrust/system/tbb/detail/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/for_each.h" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/gather.h" "$(@D)/cuda/include/thrust/system/tbb/detail/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/generate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/get_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/get_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/tbb/detail/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/tbb/detail/iter_swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/logical.h" "$(@D)/cuda/include/thrust/system/tbb/detail/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/tbb/detail/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/memory.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/memory.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/merge.h" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/merge.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/tbb/detail/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/par.h" "$(@D)/cuda/include/thrust/system/tbb/detail/par.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/partition.h" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/partition.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_intervals.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/remove.h" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/remove.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/replace.h" "$(@D)/cuda/include/thrust/system/tbb/detail/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reverse.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scatter.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sequence.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/tbb/detail/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sort.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sort.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/tbb/detail/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/tbb/detail/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/vector.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/vector.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/memory.h" "$(@D)/cuda/include/thrust/system/tbb/memory.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/vector.h" "$(@D)/cuda/include/thrust/system/tbb/vector.h" && cp "/usr/local/cuda-10.0/include/thrust/system_error.h" "$(@D)/cuda/include/thrust/system_error.h" && cp "/usr/local/cuda-10.0/include/thrust/tabulate.h" "$(@D)/cuda/include/thrust/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/transform.h" "$(@D)/cuda/include/thrust/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/transform_reduce.h" "$(@D)/cuda/include/thrust/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/transform_scan.h" "$(@D)/cuda/include/thrust/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/tuple.h" "$(@D)/cuda/include/thrust/tuple.h" && cp "/usr/local/cuda-10.0/include/thrust/uninitialized_copy.h" "$(@D)/cuda/include/thrust/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/uninitialized_fill.h" "$(@D)/cuda/include/thrust/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/unique.h" "$(@D)/cuda/include/thrust/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/version.h" "$(@D)/cuda/include/thrust/version.h" && cp "/usr/local/cuda-10.0/include/vector_functions.h" "$(@D)/cuda/include/vector_functions.h" && cp "/usr/local/cuda-10.0/include/vector_functions.hpp" "$(@D)/cuda/include/vector_functions.hpp" && cp "/usr/local/cuda-10.0/include/vector_types.h" "$(@D)/cuda/include/vector_types.h"
    """,
 )
 
@@ -1203,7 +1203,7 @@ genrule(
         "cuda/nvvm/libdevice/libdevice.10.bc",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/nvvm/libdevice/libdevice.10.bc" "$(@D)//libdevice.10.bc"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-10.0/nvvm/libdevice/libdevice.10.bc" "$(@D)//libdevice.10.bc"
    """,
 )
 
@@ -1240,7 +1240,7 @@ genrule(
         "cuda/extras/CUPTI/include/openacc/cupti_openacc.h",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/gl.h" "$(@D)/cuda/extras/CUPTI/include/GL/gl.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glew.h" "$(@D)/cuda/extras/CUPTI/include/GL/glew.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glext.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glu.h" "$(@D)/cuda/extras/CUPTI/include/GL/glu.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glut.h" "$(@D)/cuda/extras/CUPTI/include/GL/glut.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glx.h" "$(@D)/cuda/extras/CUPTI/include/GL/glx.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glxext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glxext.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglew.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglew.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglext.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglext.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cuda_stdint.h" "$(@D)/cuda/extras/CUPTI/include/cuda_stdint.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti.h" "$(@D)/cuda/extras/CUPTI/include/cupti.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_activity.h" "$(@D)/cuda/extras/CUPTI/include/cupti_activity.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_callbacks.h" "$(@D)/cuda/extras/CUPTI/include/cupti_callbacks.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_driver_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_driver_cbid.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_events.h" "$(@D)/cuda/extras/CUPTI/include/cupti_events.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_metrics.h" "$(@D)/cuda/extras/CUPTI/include/cupti_metrics.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_nvtx_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_nvtx_cbid.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_result.h" "$(@D)/cuda/extras/CUPTI/include/cupti_result.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_runtime_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_runtime_cbid.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_version.h" "$(@D)/cuda/extras/CUPTI/include/cupti_version.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaGL_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaGL_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaVDPAU_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_nvtx_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_nvtx_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/openacc/cupti_openacc.h" "$(@D)/cuda/extras/CUPTI/include/openacc/cupti_openacc.h"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/gl.h" "$(@D)/cuda/extras/CUPTI/include/GL/gl.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glew.h" "$(@D)/cuda/extras/CUPTI/include/GL/glew.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glext.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glu.h" "$(@D)/cuda/extras/CUPTI/include/GL/glu.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glut.h" "$(@D)/cuda/extras/CUPTI/include/GL/glut.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glx.h" "$(@D)/cuda/extras/CUPTI/include/GL/glx.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glxext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glxext.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/wglew.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglew.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/wglext.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglext.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cuda_stdint.h" "$(@D)/cuda/extras/CUPTI/include/cuda_stdint.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti.h" "$(@D)/cuda/extras/CUPTI/include/cupti.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_activity.h" "$(@D)/cuda/extras/CUPTI/include/cupti_activity.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_callbacks.h" "$(@D)/cuda/extras/CUPTI/include/cupti_callbacks.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_driver_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_driver_cbid.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_events.h" "$(@D)/cuda/extras/CUPTI/include/cupti_events.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_metrics.h" "$(@D)/cuda/extras/CUPTI/include/cupti_metrics.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_nvtx_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_nvtx_cbid.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_result.h" "$(@D)/cuda/extras/CUPTI/include/cupti_result.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_runtime_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_runtime_cbid.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_version.h" "$(@D)/cuda/extras/CUPTI/include/cupti_version.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cudaGL_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaGL_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cudaVDPAU_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_nvtx_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_nvtx_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/openacc/cupti_openacc.h" "$(@D)/cuda/extras/CUPTI/include/openacc/cupti_openacc.h"
    """,
 )
 
@@ -1248,17 +1248,17 @@ genrule(
     name = "cuda-lib",
     outs = [
         "cuda/lib/libcuda.so",
-        "cuda/lib/libcudart.so.9.0",
+        "cuda/lib/libcudart.so.10.0",
         "cuda/lib/libcudart_static.a",
-        "cuda/lib/libcublas.so.9.0",
-        "cuda/lib/libcusolver.so.9.0",
-        "cuda/lib/libcurand.so.9.0",
-        "cuda/lib/libcufft.so.9.0",
+        "cuda/lib/libcublas.so.10.0",
+        "cuda/lib/libcusolver.so.10.0",
+        "cuda/lib/libcurand.so.10.0",
+        "cuda/lib/libcufft.so.10.0",
         "cuda/lib/libcudnn.so.7",
-        "cuda/lib/libcupti.so.9.0",
+        "cuda/lib/libcupti.so.10.0",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart.so.9.0.176" "$(@D)/cuda/lib/libcudart.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcublas.so.9.0.480" "$(@D)/cuda/lib/libcublas.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcusolver.so.9.0.176" "$(@D)/cuda/lib/libcusolver.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcurand.so.9.0.176" "$(@D)/cuda/lib/libcurand.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcufft.so.9.0.176" "$(@D)/cuda/lib/libcufft.so.9.0" && cp "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.2.1" "$(@D)/cuda/lib/libcudnn.so.7" && cp "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.9.0.176" "$(@D)/cuda/lib/libcupti.so.9.0"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart.so.10.0.176" "$(@D)/cuda/lib/libcudart.so.10.0" && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcublas.so.10.0.480" "$(@D)/cuda/lib/libcublas.so.10.0" && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcusolver.so.10.0.176" "$(@D)/cuda/lib/libcusolver.so.10.0" && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcurand.so.10.0.176" "$(@D)/cuda/lib/libcurand.so.10.0" && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcufft.so.10.0.176" "$(@D)/cuda/lib/libcufft.so.10.0" && cp "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.2.1" "$(@D)/cuda/lib/libcudnn.so.7" && cp "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.10.0.176" "$(@D)/cuda/lib/libcupti.so.10.0"
    """,
 )
 
diff --git a/third_party/toolchains/gpus/cuda/cuda/cuda_config.h b/third_party/toolchains/gpus/cuda/cuda/cuda_config.h
index 7cdaf144ada77c93119f7412df93e8f3423872ee..b05bfb732651360581d2ef9d353f16b6f9e2d9a6 100644
--- a/third_party/toolchains/gpus/cuda/cuda/cuda_config.h
+++ b/third_party/toolchains/gpus/cuda/cuda/cuda_config.h
@@ -19,9 +19,9 @@ limitations under the License.
 
 #define TF_CUDA_CAPABILITIES CudaVersion("3.0")
 
-#define TF_CUDA_VERSION "9.0"
+#define TF_CUDA_VERSION "10.0"
 #define TF_CUDNN_VERSION "7"
 
-#define TF_CUDA_TOOLKIT_PATH "/usr/local/cuda-9.0"
+#define TF_CUDA_TOOLKIT_PATH "/usr/local/cuda-10.0"
 
 #endif  // CUDA_CUDA_CONFIG_H_
diff --git a/third_party/toolchains/preconfig/generate/BUILD b/third_party/toolchains/preconfig/generate/BUILD
index b4c98dc94de7a0368efbce712e8a3b48c49f7841..ad79255251dd01966dfa2d5aa63bcc2e343dde17 100644
--- a/third_party/toolchains/preconfig/generate/BUILD
+++ b/third_party/toolchains/preconfig/generate/BUILD
@@ -2,6 +2,12 @@ licenses(["restricted"])
 
 load(":generate.bzl", "tensorflow_rbe_config")
 
+tensorflow_rbe_config(
+    name = "ubuntu16.04-py3-clang",
+    compiler = "clang",
+    python_version = "3",
+)
+
 tensorflow_rbe_config(
     name = "ubuntu14.04-py3-gcc-cuda9.0-cudnn7-tensorrt5",
     compiler = "gcc",
diff --git a/third_party/toolchains/preconfig/generate/containers.bzl b/third_party/toolchains/preconfig/generate/containers.bzl
index c56c6f3346ac64d516fa08f02ba9a206571a35e3..428208523b9d59f0537b237a89cc6d811220ebad 100644
--- a/third_party/toolchains/preconfig/generate/containers.bzl
+++ b/third_party/toolchains/preconfig/generate/containers.bzl
@@ -1,4 +1,5 @@
 container_digests = {
-    "cuda9.0-cudnn7-ubuntu14.04": "sha256:c43ed5341dd765042e0bbd1bf50fadeedd649d1e0c34d81999cb6ce30916cb95",
-    "cuda10.0-cudnn7-ubuntu14.04": "sha256:919e75247743ae1244d5d72ee9f18090379d4a9035e5853010f6d59d87cd2e8b",
+    "ubuntu16.04": "sha256:d0d98c53111c3ec071aa81632a2b0d6f210e5c2411c5172e31f99002125ec4de",
+    "cuda9.0-cudnn7-ubuntu14.04": "sha256:006a76ee1838122ff7f21ebac85f24c1ef350d4dd79b3ceff0e4fe649ed90d33",
+    "cuda10.0-cudnn7-ubuntu14.04": "sha256:e36f05f1ff39e39ddf07122e37f2b1895948bb6f7acc3db37a3c496be5e66228",
 }
diff --git a/third_party/toolchains/preconfig/generate/generate.bzl b/third_party/toolchains/preconfig/generate/generate.bzl
index 75deea41b819d0deaf35af71587322f41ff095c0..fc485d43d2456a7db9c1e737627482c5115eb70d 100644
--- a/third_party/toolchains/preconfig/generate/generate.bzl
+++ b/third_party/toolchains/preconfig/generate/generate.bzl
@@ -3,30 +3,38 @@ load(
     "docker_toolchain_autoconfig",
 )
 
-def _tensorflow_rbe_config(name, cuda_version, cudnn_version, python_version, compiler, tensorrt_version):
-    docker_toolchain_autoconfig(
-        name = name,
-        base = "@cuda%s-cudnn%s-ubuntu14.04//image" % (cuda_version, cudnn_version),
-        bazel_version = "0.19.2",
+def _tensorflow_rbe_config(name, compiler, python_version, cuda_version = None, cudnn_version = None, tensorrt_version = None):
+    base = "@ubuntu16.04//image"
+    config_repos = [
+        "local_config_python",
+        "local_config_cc",
+    ]
+    env = {
+        "ABI_VERSION": "gcc",
+        "ABI_LIBC_VERSION": "glibc_2.19",
+        "BAZEL_COMPILER": compiler,
+        "BAZEL_HOST_SYSTEM": "i686-unknown-linux-gnu",
+        "BAZEL_TARGET_LIBC": "glibc_2.19",
+        "BAZEL_TARGET_CPU": "k8",
+        "BAZEL_TARGET_SYSTEM": "x86_64-unknown-linux-gnu",
+        "CC_TOOLCHAIN_NAME": "linux_gnu_x86",
+        "CC": compiler,
+        "PYTHON_BIN_PATH": "/usr/bin/python%s" % python_version,
+        "CLEAR_CACHE": "1",
+    }
+
+    if cuda_version != None:
+        base = "@cuda%s-cudnn%s-ubuntu14.04//image" % (cuda_version, cudnn_version)
+        # The cuda toolchain currently contains its own C++ toolchain definition,
+        # so we do not fetch local_config_cc.
         config_repos = [
-            "local_config_cuda",
             "local_config_python",
+            "local_config_cuda",
             "local_config_tensorrt",
-        ],
-        env = {
-            "ABI_VERSION": "gcc",
-            "ABI_LIBC_VERSION": "glibc_2.19",
-            "BAZEL_COMPILER": compiler,
-            "BAZEL_HOST_SYSTEM": "i686-unknown-linux-gnu",
-            "BAZEL_TARGET_LIBC": "glibc_2.19",
-            "BAZEL_TARGET_CPU": "k8",
-            "BAZEL_TARGET_SYSTEM": "x86_64-unknown-linux-gnu",
-            "CC_TOOLCHAIN_NAME": "linux_gnu_x86",
-            "CC": compiler,
-            "PYTHON_BIN_PATH": "/usr/bin/python%s" % python_version,
+        ]
+        env.update({
             "TF_NEED_CUDA": "1",
             "TF_CUDA_CLANG": "1" if compiler == "clang" else "0",
-            "CLEAR_CACHE": "1",
             "TF_CUDA_COMPUTE_CAPABILITIES": "3.0",
             "TF_ENABLE_XLA": "1",
             "TF_CUDNN_VERSION": cudnn_version,
@@ -35,7 +43,14 @@ def _tensorflow_rbe_config(name, cuda_version, cudnn_version, python_version, co
             "TF_NEED_TENSORRT" : "1",
             "TF_TENSORRT_VERSION": tensorrt_version,
             "TENSORRT_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
-        },
+        })
+
+    docker_toolchain_autoconfig(
+        name = name,
+        base = base,
+        bazel_version = "0.21.0",
+        config_repos = config_repos,
+        env = env,
         mount_project = "$(mount_project)",
         tags = ["manual"],
         incompatible_changes_off = True,
diff --git a/third_party/toolchains/preconfig/generate/generate.sh b/third_party/toolchains/preconfig/generate/generate.sh
index 8e3a1e6ada33fbc0969409ecda2394e2f67e8bc9..c05a4de6fb62f897b942cc13910b3abb26743063 100755
--- a/third_party/toolchains/preconfig/generate/generate.sh
+++ b/third_party/toolchains/preconfig/generate/generate.sh
@@ -37,8 +37,16 @@ TENSORRT_VERSION="${PLATFORM[5]}"
 
 # TODO(klimek): Put this into the name.
 
-if [[ "${COMPILER}" == "gcc" ]]; then
-  COMPILER="gcc-nvcc-${CUDA_VERSION}"
+if [[ -n "${CUDA_VERSION}" ]]; then
+  if [[ "${COMPILER}" == "gcc" ]]; then
+    COMPILER="gcc-nvcc-${CUDA_VERSION}"
+  fi
+  # Currently we create a special toolchain for clang when compiling with
+  # cuda enabled. We can get rid of this once the default toolchain bazel
+  # provides supports cuda.
+  if [[ "${COMPILER}" == "clang" ]]; then
+    COMPILER="cuda-clang"
+  fi
 fi
 
 echo "OS: ${OS}"
@@ -52,6 +60,8 @@ bazel build --define=mount_project="${PWD}" "${PKG}/generate:${TARGET}"
 cd "${TEMPDIR}"
 tar xvf "${ROOT}/bazel-bin/${PKG}/generate/${TARGET}_outputs.tar"
 
+# TODO(klimek): The skylark config rules should copy the files instead of
+# creating aliases.
 # Other than in @local_config_tensorrt, the header files in the remote config
 # repo are not relative to the repository root. Add a dummy include_prefix to
 # make them available as virtual includes.
@@ -74,14 +84,19 @@ mkdir "${OS}"
 # Python:
 mv local_config_python "${OS}/${PY_VERSION}"
 
-# Compiler:
-mv local_config_cuda/crosstool "${OS}/${COMPILER}"
+if [[ -n "${CUDA_VERSION}" ]]; then
+  # Compiler:
+  mv local_config_cuda/crosstool "${OS}/${COMPILER}"
 
-# CUDA:
-mv local_config_cuda "${OS}/${CUDA_VERSION}-${CUDNN_VERSION}"
+  # CUDA:
+  mv local_config_cuda "${OS}/${CUDA_VERSION}-${CUDNN_VERSION}"
 
-# TensorRT:
-mv local_config_tensorrt "${OS}/${TENSORRT_VERSION}"
+  # TensorRT:
+  mv local_config_tensorrt "${OS}/${TENSORRT_VERSION}"
+else
+  # Compiler:
+  mv local_config_cc "${OS}/${COMPILER}"
+fi
 
 # Cleanup for copybara.
 find "${OS}" -name 'BUILD' -o -name '*.bzl' |xargs buildifier
diff --git a/third_party/toolchains/preconfig/generate/workspace.bzl b/third_party/toolchains/preconfig/generate/workspace.bzl
index f30c2f1ae6318c645e174617a74b8fdadac1598e..0495173786328367b1a74d00653da58f759d963c 100644
--- a/third_party/toolchains/preconfig/generate/workspace.bzl
+++ b/third_party/toolchains/preconfig/generate/workspace.bzl
@@ -8,17 +8,24 @@ load(":containers.bzl", "container_digests")
 def _remote_config_workspace():
     container_repositories()
 
+    container_pull(
+        name = "ubuntu16.04",
+        registry = "gcr.io",
+        repository = "tensorflow-testing/nosla-ubuntu16.04",
+        digest = container_digests["ubuntu16.04"],
+    )
+
     container_pull(
         name = "cuda9.0-cudnn7-ubuntu14.04",
         registry = "gcr.io",
-        repository = "asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04",
+        repository = "tensorflow-testing/nosla-cuda9.0-cudnn7-ubuntu14.04",
         digest = container_digests["cuda9.0-cudnn7-ubuntu14.04"],
     )
 
     container_pull(
         name = "cuda10.0-cudnn7-ubuntu14.04",
         registry = "gcr.io",
-        repository = "asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04",
+        repository = "tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu14.04",
         digest = container_digests["cuda10.0-cudnn7-ubuntu14.04"],
     )
 
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
index 00483951af966e0085e6f2b1d74290d9ee872963..426b9ca86746c3ef92299435d7de4e6191e4b664 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
@@ -103,8 +103,9 @@ def InvokeNvcc(argv, log=False):
     The return value of calling os.system('nvcc ' + args)
   """
 
-  src_files = [f for f in argv if
-               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  src_files = [
+      f for f in argv if re.search(r'\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)
+  ]
   if len(src_files) == 0:
     raise Error('No source files found for cuda compilation.')
 
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py
index 859b3196d5dba9afadeae56f34be04247b00fe09..b0b4a53a805cba4e1be3b6b5438ca725a3599e78 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py
@@ -103,8 +103,9 @@ def InvokeNvcc(argv, log=False):
     The return value of calling os.system('nvcc ' + args)
   """
 
-  src_files = [f for f in argv if
-               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  src_files = [
+      f for f in argv if re.search(r'\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)
+  ]
   if len(src_files) == 0:
     raise Error('No source files found for cuda compilation.')
 
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/clang/bin/crosstool_wrapper_driver_is_not_gcc b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/clang/bin/crosstool_wrapper_driver_is_not_gcc
index 63893d3722f6b43579758e5f747076b1f1e73ed7..192314137d4f5ca178e350894550132d045d7a2b 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/clang/bin/crosstool_wrapper_driver_is_not_gcc
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/clang/bin/crosstool_wrapper_driver_is_not_gcc
@@ -49,9 +49,9 @@ import pipes
 CPU_COMPILER = ('/usr/bin/gcc')
 GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc')
 
-NVCC_PATH = '/usr/local/cuda-9.0/bin/nvcc'
+NVCC_PATH = '/usr/local/cuda/bin/nvcc'
 PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH)
-NVCC_VERSION = '9.0'
+NVCC_VERSION = '10.0'
 
 def Log(s):
   print('gpus/crosstool: {0}'.format(s))
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/windows/msvc_wrapper_for_nvcc.py
index 859b3196d5dba9afadeae56f34be04247b00fe09..b0b4a53a805cba4e1be3b6b5438ca725a3599e78 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/windows/msvc_wrapper_for_nvcc.py
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/windows/msvc_wrapper_for_nvcc.py
@@ -103,8 +103,9 @@ def InvokeNvcc(argv, log=False):
     The return value of calling os.system('nvcc ' + args)
   """
 
-  src_files = [f for f in argv if
-               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  src_files = [
+      f for f in argv if re.search(r'\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)
+  ]
   if len(src_files) == 0:
     raise Error('No source files found for cuda compilation.')
 
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/clang/BUILD b/third_party/toolchains/preconfig/ubuntu16.04/clang/BUILD
new file mode 100755
index 0000000000000000000000000000000000000000..5a0c52f66ab2224c0b021875d0447ee638e833c4
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/clang/BUILD
@@ -0,0 +1,111 @@
+# Copyright 2016 The Bazel Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This becomes the BUILD file for @local_config_cc// under non-FreeBSD unixes.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+cc_library(
+    name = "malloc",
+)
+
+cc_library(
+    name = "stl",
+)
+
+filegroup(
+    name = "empty",
+    srcs = [],
+)
+
+filegroup(
+    name = "cc_wrapper",
+    srcs = ["cc_wrapper.sh"],
+)
+
+filegroup(
+    name = "compiler_deps",
+    srcs = glob(["extra_tools/**"]) + [":empty"],
+)
+
+# This is the entry point for --crosstool_top.  Toolchains are found
+# by lopping off the name of --crosstool_top and searching for
+# the "${CPU}" entry in the toolchains attribute.
+cc_toolchain_suite(
+    name = "toolchain",
+    toolchains = {
+        "k8|clang": ":cc-compiler-k8",
+        "k8": ":cc-compiler-k8",
+        "armeabi-v7a|compiler": ":cc-compiler-armeabi-v7a",
+        "armeabi-v7a": ":cc-compiler-armeabi-v7a",
+    },
+)
+
+cc_toolchain(
+    name = "cc-compiler-k8",
+    all_files = ":compiler_deps",
+    compiler_files = ":compiler_deps",
+    cpu = "k8",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":compiler_deps",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 1,
+    toolchain_identifier = "linux_gnu_x86",
+)
+
+toolchain(
+    name = "cc-toolchain-k8",
+    exec_compatible_with = [
+        # TODO(katre): add autodiscovered constraints for host CPU and OS.
+    ],
+    target_compatible_with = [
+        # TODO(katre): add autodiscovered constraints for host CPU and OS.
+    ],
+    toolchain = ":cc-compiler-k8",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
+
+# Android tooling requires a default toolchain for the armeabi-v7a cpu.
+cc_toolchain(
+    name = "cc-compiler-armeabi-v7a",
+    all_files = ":empty",
+    compiler_files = ":empty",
+    cpu = "local",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":empty",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 1,
+    toolchain_identifier = "stub_armeabi-v7a",
+)
+
+toolchain(
+    name = "cc-toolchain-armeabi-v7a",
+    exec_compatible_with = [
+        # TODO(katre): add autodiscovered constraints for host CPU and OS.
+    ],
+    target_compatible_with = [
+        "@bazel_tools//platforms:arm",
+        "@bazel_tools//platforms:android",
+    ],
+    toolchain = ":cc-compiler-armabi-v7a",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/clang/CROSSTOOL b/third_party/toolchains/preconfig/ubuntu16.04/clang/CROSSTOOL
new file mode 100755
index 0000000000000000000000000000000000000000..48f82eb35d5b2268a758bb0ebb36e243663ca372
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/clang/CROSSTOOL
@@ -0,0 +1,1209 @@
+# Copyright 2016 The Bazel Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+major_version: "local"
+minor_version: ""
+
+# Android tooling requires a default toolchain for the armeabi-v7a cpu.
+toolchain {
+  abi_version: "armeabi-v7a"
+  abi_libc_version: "armeabi-v7a"
+  builtin_sysroot: ""
+  compiler: "compiler"
+  host_system_name: "armeabi-v7a"
+  needsPic: true
+  supports_gold_linker: false
+  supports_incremental_linker: false
+  supports_fission: false
+  supports_interface_shared_objects: false
+  supports_normalizing_ar: false
+  supports_start_end_lib: false
+  target_libc: "armeabi-v7a"
+  target_cpu: "armeabi-v7a"
+  target_system_name: "armeabi-v7a"
+  toolchain_identifier: "stub_armeabi-v7a"
+
+  tool_path { name: "ar" path: "/bin/false" }
+  tool_path { name: "compat-ld" path: "/bin/false" }
+  tool_path { name: "cpp" path: "/bin/false" }
+  tool_path { name: "dwp" path: "/bin/false" }
+  tool_path { name: "gcc" path: "/bin/false" }
+  tool_path { name: "gcov" path: "/bin/false" }
+  tool_path { name: "ld" path: "/bin/false" }
+
+  tool_path { name: "nm" path: "/bin/false" }
+  tool_path { name: "objcopy" path: "/bin/false" }
+  tool_path { name: "objdump" path: "/bin/false" }
+  tool_path { name: "strip" path: "/bin/false" }
+  linking_mode_flags { mode: DYNAMIC }
+}
+
+toolchain {
+  toolchain_identifier: "linux_gnu_x86"
+  abi_version: "gcc"
+  abi_libc_version: "glibc_2.19"
+  builtin_sysroot: ""
+  compiler: "clang"
+  host_system_name: "i686-unknown-linux-gnu"
+  needsPic: true
+  supports_gold_linker: true
+  supports_incremental_linker: false
+  supports_fission: false
+  supports_interface_shared_objects: false
+  supports_normalizing_ar: false
+  supports_start_end_lib: true
+  target_libc: "glibc_2.19"
+  target_cpu: "k8"
+  target_system_name: "x86_64-unknown-linux-gnu"
+  cxx_flag: "-std=c++0x"
+  linker_flag: "-fuse-ld=gold"
+  linker_flag: "-Wl,-no-as-needed"
+  linker_flag: "-Wl,-z,relro,-z,now"
+  linker_flag: "-B/usr/local/bin"
+  linker_flag: "-lstdc++"
+  linker_flag: "-lm"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/local/lib/clang/7.0.0/include"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu"
+  cxx_builtin_include_directory: "/usr/include"
+  cxx_builtin_include_directory: "/usr/include/c++/4.9"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu/c++/4.9"
+  cxx_builtin_include_directory: "/usr/include/c++/4.9/backward"
+  objcopy_embed_flag: "-I"
+  objcopy_embed_flag: "binary"
+  unfiltered_cxx_flag: "-no-canonical-prefixes"
+  unfiltered_cxx_flag: "-Wno-builtin-macro-redefined"
+  unfiltered_cxx_flag: "-D__DATE__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
+  compiler_flag: "-U_FORTIFY_SOURCE"
+  compiler_flag: "-fstack-protector"
+  compiler_flag: "-Wall"
+  compiler_flag: "-Wthread-safety"
+  compiler_flag: "-Wself-assign"
+  compiler_flag: "-fcolor-diagnostics"
+  compiler_flag: "-fno-omit-frame-pointer"
+  tool_path {name: "ar" path: "/usr/bin/ar" }
+  tool_path {name: "ld" path: "/usr/bin/ld" }
+  tool_path {name: "cpp" path: "/usr/bin/cpp" }
+  tool_path {name: "gcc" path: "/usr/local/bin/clang" }
+  tool_path {name: "dwp" path: "/usr/bin/dwp" }
+  tool_path {name: "gcov" path: "None" }
+  tool_path {name: "nm" path: "/usr/bin/nm" }
+  tool_path {name: "objcopy" path: "/usr/bin/objcopy" }
+  tool_path {name: "objdump" path: "/usr/bin/objdump" }
+  tool_path {name: "strip" path: "/usr/bin/strip" }
+
+  compilation_mode_flags {
+    mode: DBG
+    compiler_flag: "-g"
+  }
+  compilation_mode_flags {
+    mode: OPT
+    compiler_flag: "-g0"
+    compiler_flag: "-O2"
+    compiler_flag: "-D_FORTIFY_SOURCE=1"
+    compiler_flag: "-DNDEBUG"
+    compiler_flag: "-ffunction-sections"
+    compiler_flag: "-fdata-sections"
+    linker_flag: "-Wl,--gc-sections"
+  }
+  linking_mode_flags { mode: DYNAMIC }
+
+
+    feature {
+      name: 'coverage'
+      provides: 'profile'
+      flag_set {
+        action: 'preprocess-assemble'
+        action: 'c-compile'
+        action: 'c++-compile'
+        action: 'c++-header-parsing'
+        action: 'c++-module-compile'
+        flag_group {
+        flag: '--coverage'
+      }
+      }
+      flag_set {
+        action: 'c++-link-dynamic-library'
+        action: 'c++-link-nodeps-dynamic-library'
+        action: 'c++-link-executable'
+        flag_group {
+        flag: '--coverage'
+      }
+      }
+    }
+  
+
+  feature {
+    name: 'fdo_optimize'
+    provides: 'profile'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      expand_if_all_available: 'fdo_profile_path'
+      flag_group {
+        flag: '-fprofile-use=%{fdo_profile_path}'
+        flag: '-fprofile-correction',
+      }
+    }
+  }
+}
+
+toolchain {
+  toolchain_identifier: "msys_x64_mingw"
+  abi_version: "local"
+  abi_libc_version: "local"
+  builtin_sysroot: ""
+  compiler: "mingw-gcc"
+  host_system_name: "local"
+  needsPic: false
+  target_libc: "mingw"
+  target_cpu: "x64_windows"
+  target_system_name: "local"
+
+  artifact_name_pattern {
+     category_name: 'executable'
+     prefix: ''
+     extension: '.exe'
+  }
+
+
+
+  linking_mode_flags { mode: DYNAMIC }
+}
+
+toolchain {
+  toolchain_identifier: "msvc_x64"
+  host_system_name: "local"
+  target_system_name: "local"
+
+  abi_version: "local"
+  abi_libc_version: "local"
+  target_cpu: "x64_windows"
+  compiler: "msvc-cl"
+  target_libc: "msvcrt"
+  default_python_version: "python2.7"
+
+
+
+  tool_path {
+    name: "ar"
+    path: ""
+  }
+  tool_path {
+    name: "ml"
+    path: ""
+  }
+  tool_path {
+    name: "cpp"
+    path: ""
+  }
+  tool_path {
+    name: "gcc"
+    path: ""
+  }
+  tool_path {
+    name: "gcov"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "ld"
+    path: ""
+  }
+  tool_path {
+    name: "nm"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objcopy"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objdump"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "strip"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  supports_gold_linker: false
+  supports_start_end_lib: false
+  supports_interface_shared_objects: true
+  supports_incremental_linker: false
+  supports_normalizing_ar: true
+  needsPic: false
+
+  # TODO(pcloudy): Review those flags below, they should be defined by cl.exe
+  compiler_flag: "/DCOMPILER_MSVC"
+
+  # Don't define min/max macros in windows.h.
+  compiler_flag: "/DNOMINMAX"
+
+  # Platform defines.
+  compiler_flag: "/D_WIN32_WINNT=0x0601"
+  # Turn off warning messages.
+  compiler_flag: "/D_CRT_SECURE_NO_DEPRECATE"
+  compiler_flag: "/D_CRT_SECURE_NO_WARNINGS"
+
+  # Useful options to have on for compilation.
+  # Increase the capacity of object files to 2^32 sections.
+  compiler_flag: "/bigobj"
+  # Allocate 500MB for precomputed headers.
+  compiler_flag: "/Zm500"
+  # Catch C++ exceptions only and tell the compiler to assume that functions declared
+  # as extern "C" never throw a C++ exception.
+  compiler_flag: "/EHsc"
+
+  # Globally disabled warnings.
+  # Don't warn about elements of array being be default initialized.
+  compiler_flag: "/wd4351"
+  # Don't warn about no matching delete found.
+  compiler_flag: "/wd4291"
+  # Don't warn about diamond inheritance patterns.
+  compiler_flag: "/wd4250"
+  # Don't warn about insecure functions (e.g. non _s functions).
+  compiler_flag: "/wd4996"
+
+  linker_flag: "/MACHINE:X64"
+
+  feature {
+    name: "no_legacy_features"
+  }
+
+  artifact_name_pattern {
+     category_name: 'object_file'
+     prefix: ''
+     extension: '.obj'
+  }
+
+  artifact_name_pattern {
+     category_name: 'static_library'
+     prefix: ''
+     extension: '.lib'
+  }
+
+  artifact_name_pattern {
+     category_name: 'alwayslink_static_library'
+     prefix: ''
+     extension: '.lo.lib'
+  }
+
+  artifact_name_pattern {
+     category_name: 'executable'
+     prefix: ''
+     extension: '.exe'
+  }
+
+  artifact_name_pattern {
+     category_name: 'dynamic_library'
+     prefix: ''
+     extension: '.dll'
+  }
+
+  artifact_name_pattern {
+     category_name: 'interface_library'
+     prefix: ''
+     extension: '.if.lib'
+  }
+
+  # Suppress startup banner.
+  feature {
+    name: "nologo"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      flag_group {
+        flag: "/nologo"
+      }
+    }
+  }
+
+  feature {
+    name: 'has_configured_linker_path'
+  }
+
+  # This feature indicates strip is not supported, building stripped binary will just result a copy of orignial binary
+  feature {
+    name: 'no_stripping'
+  }
+
+  # This feature indicates this is a toolchain targeting Windows.
+  feature {
+    name: 'targets_windows'
+    implies: 'copy_dynamic_libraries_to_binary'
+    enabled: true
+  }
+
+  feature {
+    name: 'copy_dynamic_libraries_to_binary'
+  }
+
+  action_config {
+    config_name: 'assemble'
+    action_name: 'assemble'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'preprocess-assemble'
+    action_name: 'preprocess-assemble'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'c-compile'
+    action_name: 'c-compile'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-compile'
+    action_name: 'c++-compile'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-link-executable'
+    action_name: 'c++-link-executable'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+  }
+
+  action_config {
+    config_name: 'c++-link-dynamic-library'
+    action_name: 'c++-link-dynamic-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'shared_flag'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+    implies: 'has_configured_linker_path'
+    implies: 'def_file'
+  }
+
+  action_config {
+      config_name: 'c++-link-nodeps-dynamic-library'
+      action_name: 'c++-link-nodeps-dynamic-library'
+      tool {
+        tool_path: ''
+      }
+      implies: 'nologo'
+      implies: 'shared_flag'
+      implies: 'linkstamps'
+      implies: 'output_execpath_flags'
+      implies: 'input_param_flags'
+      implies: 'user_link_flags'
+      implies: 'legacy_link_flags'
+      implies: 'linker_subsystem_flag'
+      implies: 'linker_param_file'
+      implies: 'msvc_env'
+      implies: 'no_stripping'
+      implies: 'has_configured_linker_path'
+      implies: 'def_file'
+    }
+
+  action_config {
+    config_name: 'c++-link-static-library'
+    action_name: 'c++-link-static-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'archiver_flags'
+    implies: 'input_param_flags'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+  }
+
+  # TODO(b/65151735): Remove legacy_compile_flags feature when legacy fields are
+  # not used in this crosstool
+  feature {
+    name: 'legacy_compile_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'legacy_compile_flags'
+        flag: '%{legacy_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: "msvc_env"
+    env_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      env_entry {
+        key: "PATH"
+        value: ""
+      }
+      env_entry {
+        key: "TMP"
+        value: ""
+      }
+      env_entry {
+        key: "TEMP"
+        value: ""
+      }
+    }
+    implies: 'msvc_compile_env'
+    implies: 'msvc_link_env'
+  }
+
+  feature {
+    name: "msvc_compile_env"
+    env_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      env_entry {
+        key: "INCLUDE"
+        value: ""
+      }
+    }
+  }
+
+  feature {
+    name: "msvc_link_env"
+    env_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      env_entry {
+        key: "LIB"
+        value: ""
+      }
+    }
+  }
+
+  feature {
+    name: 'include_paths'
+    flag_set {
+      action: "assemble"
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      flag_group {
+        iterate_over: 'quote_include_paths'
+        flag: '/I%{quote_include_paths}'
+      }
+      flag_group {
+        iterate_over: 'include_paths'
+        flag: '/I%{include_paths}'
+      }
+      flag_group {
+        iterate_over: 'system_include_paths'
+        flag: '/I%{system_include_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: "preprocessor_defines"
+    flag_set {
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-module-compile"
+      flag_group {
+        flag: "/D%{preprocessor_defines}"
+        iterate_over: "preprocessor_defines"
+      }
+    }
+  }
+
+  # Tell Bazel to parse the output of /showIncludes
+  feature {
+    name: 'parse_showincludes'
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-module-compile'
+      action: 'c++-header-parsing'
+      flag_group {
+        flag: "/showIncludes"
+      }
+    }
+  }
+
+
+  feature {
+    name: 'generate_pdb_file'
+    requires: {
+      feature: 'dbg'
+    }
+    requires: {
+      feature: 'fastbuild'
+    }
+  }
+
+  feature {
+    name: 'shared_flag'
+    flag_set {
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/DLL'
+      }
+    }
+  }
+
+  feature {
+    name: 'linkstamps'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      expand_if_all_available: 'linkstamp_paths'
+      flag_group {
+        iterate_over: 'linkstamp_paths'
+        flag: '%{linkstamp_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: 'output_execpath_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'archiver_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'input_param_flags'
+    flag_set {
+      expand_if_all_available: 'interface_library_output_path'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/IMPLIB:%{interface_library_output_path}"
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libopts'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'libopts'
+        flag: '%{libopts}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libraries_to_link'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        iterate_over: 'libraries_to_link'
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file_group'
+          }
+          iterate_over: 'libraries_to_link.object_files'
+          flag_group {
+            flag: '%{libraries_to_link.object_files}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'interface_library'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'static_library'
+          }
+          flag_group {
+            expand_if_false: 'libraries_to_link.is_whole_archive'
+            flag: '%{libraries_to_link.name}'
+          }
+          flag_group {
+            expand_if_true: 'libraries_to_link.is_whole_archive'
+            flag: '/WHOLEARCHIVE:%{libraries_to_link.name}'
+          }
+        }
+      }
+    }
+  }
+
+  # Since this feature is declared earlier in the CROSSTOOL than
+  # "user_link_flags", this feature will be applied prior to it anwyhere they
+  # are both implied. And since "user_link_flags" contains the linkopts from
+  # the build rule, this allows the user to override the /SUBSYSTEM in the BUILD
+  # file.
+  feature {
+    name: 'linker_subsystem_flag'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/SUBSYSTEM:CONSOLE'
+      }
+    }
+  }
+
+  # The "user_link_flags" contains user-defined linkopts (from build rules)
+  # so it should be defined after features that declare user-overridable flags.
+  # For example the "linker_subsystem_flag" defines a default "/SUBSYSTEM" flag
+  # but we want to let the user override it, therefore "link_flag_subsystem" is
+  # defined earlier in the CROSSTOOL file than "user_link_flags".
+  feature {
+    name: 'user_link_flags'
+    flag_set {
+      expand_if_all_available: 'user_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'user_link_flags'
+        flag: '%{user_link_flags}'
+      }
+    }
+  }
+  feature {
+    name: 'legacy_link_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'legacy_link_flags'
+        flag: '%{legacy_link_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'linker_param_file'
+    flag_set {
+      expand_if_all_available: 'linker_param_file'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '@%{linker_param_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'static_link_msvcrt'
+  }
+
+  feature {
+    name: 'static_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MT"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MD"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'static_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MTd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MDd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dbg'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: ""
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'fastbuild'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: ""
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'opt'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/O2" # Implies /Og /Oi /Ot /Oy /Ob2 /Gs /GF /Gy
+      }
+    }
+    implies: 'frame_pointer'
+  }
+
+  # Keep stack frames for debugging, even in opt mode.
+  # Must come after /O1, /O2 and /Ox.
+  feature {
+    name: "frame_pointer"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "/Oy-"
+      }
+    }
+  }
+
+  # Remove assert/DCHECKs in opt mode.
+  # You can have them back with --features=-disable_assertions.
+  feature {
+    name: 'disable_assertions'
+    enabled: true
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      with_feature: {
+        feature: 'opt'
+      }
+      flag_group {
+        flag: "/DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: "determinism"
+    enabled: true
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Make C++ compilation deterministic. Use linkstamping instead of these
+        # compiler symbols.
+        # TODO: detect clang on Windows and use "-Wno-builtin-macro-redefined"
+        flag: "/wd4117" # Trying to define or undefine a predefined macro
+        flag: "-D__DATE__=\"redacted\""
+        flag: "-D__TIMESTAMP__=\"redacted\""
+        flag: "-D__TIME__=\"redacted\""
+      }
+    }
+  }
+
+  feature {
+    name: 'treat_warnings_as_errors'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/WX"
+      }
+    }
+  }
+
+  # Trade slower build time for smaller binary
+  feature {
+    name: 'smaller_binary'
+    enabled: true
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      with_feature: {
+        feature: 'opt'
+      }
+      flag_group {
+        flag: "/Gy" # Enable function-level linking (-ffunction-sections)
+        flag: "/Gw" # Optimize global data (-fdata-sections)
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library',
+      action: 'c++-link-nodeps-dynamic-library'
+      with_feature: {
+        feature: 'opt'
+      }
+      flag_group {
+        flag: '/OPT:ICF' # Fold identical functions
+        flag: '/OPT:REF' # Eliminate unreferenced functions and data
+      }
+    }
+  }
+
+  # Suppress warnings that most users do not care
+  feature {
+    name: 'ignore_noisy_warnings'
+    enabled: true
+    flag_set {
+      action: 'c++-link-static-library'
+      flag_group {
+        # Suppress 'object file does not define any public symbols' warning
+        flag: '/ignore:4221'
+      }
+    }
+  }
+
+  feature {
+    name: 'user_compile_flags'
+    flag_set {
+      expand_if_all_available: 'user_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'user_compile_flags'
+        flag: '%{user_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'sysroot'
+    flag_set {
+      expand_if_all_available: 'sysroot'
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'sysroot'
+        flag: '--sysroot=%{sysroot}'
+      }
+    }
+  }
+
+  feature {
+    name: 'unfiltered_compile_flags'
+    flag_set {
+      expand_if_all_available: 'unfiltered_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'unfiltered_compile_flags'
+        flag: '%{unfiltered_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_output_flags'
+    flag_set {
+      action: 'assemble'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+        flag: '/Zi'
+      }
+    }
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_assembly_file'
+        flag: '/Fa%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_preprocess_file'
+        flag: '/P'
+        flag: '/Fi%{output_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_input_flags'
+    flag_set {
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'source_file'
+        flag: '/c'
+        flag: '%{source_file}'
+      }
+    }
+  }
+
+  feature {
+    name : 'def_file',
+    flag_set {
+      expand_if_all_available: 'def_file_path'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEF:%{def_file_path}"
+        # We can specify a different DLL name in DEF file, /ignore:4070 suppresses
+        # the warning message about DLL name doesn't match the default one.
+        # See https://msdn.microsoft.com/en-us/library/sfkk2fz7.aspx
+        flag: "/ignore:4070"
+      }
+    }
+  }
+
+  feature {
+    name: 'windows_export_all_symbols'
+  }
+
+  feature {
+    name: 'no_windows_export_all_symbols'
+  }
+
+  linking_mode_flags { mode: DYNAMIC }
+}
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/clang/WORKSPACE b/third_party/toolchains/preconfig/ubuntu16.04/clang/WORKSPACE
new file mode 100644
index 0000000000000000000000000000000000000000..bc05b4c36ff49949e18a9c6f08b03d541149ede1
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/clang/WORKSPACE
@@ -0,0 +1,2 @@
+# DO NOT EDIT: automatically generated WORKSPACE file for cc_autoconf rule
+workspace(name = "local_config_cc")
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/clang/cc_wrapper.sh b/third_party/toolchains/preconfig/ubuntu16.04/clang/cc_wrapper.sh
new file mode 100755
index 0000000000000000000000000000000000000000..42a751dccfb0d9c7115ef5ed5483335c0e0f129b
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/clang/cc_wrapper.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+#
+# Copyright 2015 The Bazel Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Ship the environment to the C++ action
+#
+set -eu
+
+# Set-up the environment
+
+
+# Call the C++ compiler
+/usr/local/bin/clang "$@"
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/clang/dummy_toolchain.bzl b/third_party/toolchains/preconfig/ubuntu16.04/clang/dummy_toolchain.bzl
new file mode 100755
index 0000000000000000000000000000000000000000..45c0285d232806672e93cb6d9b860b2693e75d3d
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/clang/dummy_toolchain.bzl
@@ -0,0 +1,23 @@
+# pylint: disable=g-bad-file-header
+# Copyright 2017 The Bazel Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Skylark rule that stubs a toolchain."""
+
+def _dummy_toolchain_impl(ctx):
+    ctx = ctx  # unused argument
+    toolchain = platform_common.ToolchainInfo()
+    return [toolchain]
+
+dummy_toolchain = rule(_dummy_toolchain_impl, attrs = {})
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/clang/tools/cpp/empty.cc b/third_party/toolchains/preconfig/ubuntu16.04/clang/tools/cpp/empty.cc
new file mode 100755
index 0000000000000000000000000000000000000000..c272dabaeb6829b5ded592b4b37194ef3af364dd
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/clang/tools/cpp/empty.cc
@@ -0,0 +1 @@
+int main() {}
\ No newline at end of file
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/py3/BUILD b/third_party/toolchains/preconfig/ubuntu16.04/py3/BUILD
new file mode 100755
index 0000000000000000000000000000000000000000..77eaa4d5121c32f2a4d58f3bb0fb470b72c9f0f6
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/py3/BUILD
@@ -0,0 +1,205 @@
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+# To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib
+# See https://docs.python.org/3/extending/windows.html
+cc_import(
+    name = "python_lib",
+    interface_library = select({
+        ":windows": ":python_import_lib",
+        # A placeholder for Unix platforms which makes --no_build happy.
+        "//conditions:default": "not-existing.lib",
+    }),
+    system_provided = 1,
+)
+
+cc_library(
+    name = "python_headers",
+    hdrs = [":python_include"],
+    includes = ["python_include"],
+    deps = select({
+        ":windows": [":python_lib"],
+        "//conditions:default": [],
+    }),
+)
+
+cc_library(
+    name = "numpy_headers",
+    hdrs = [":numpy_include"],
+    includes = ["numpy_include"],
+)
+
+config_setting(
+    name = "windows",
+    values = {"cpu": "x64_windows"},
+    visibility = ["//visibility:public"],
+)
+
+genrule(
+    name = "python_include",
+    outs = [
+        "python_include/Python-ast.h",
+        "python_include/Python.h",
+        "python_include/abstract.h",
+        "python_include/accu.h",
+        "python_include/asdl.h",
+        "python_include/ast.h",
+        "python_include/bitset.h",
+        "python_include/bltinmodule.h",
+        "python_include/boolobject.h",
+        "python_include/bytearrayobject.h",
+        "python_include/bytes_methods.h",
+        "python_include/bytesobject.h",
+        "python_include/cellobject.h",
+        "python_include/ceval.h",
+        "python_include/classobject.h",
+        "python_include/code.h",
+        "python_include/codecs.h",
+        "python_include/compile.h",
+        "python_include/complexobject.h",
+        "python_include/datetime.h",
+        "python_include/descrobject.h",
+        "python_include/dictobject.h",
+        "python_include/dtoa.h",
+        "python_include/dynamic_annotations.h",
+        "python_include/enumobject.h",
+        "python_include/errcode.h",
+        "python_include/eval.h",
+        "python_include/fileobject.h",
+        "python_include/fileutils.h",
+        "python_include/floatobject.h",
+        "python_include/frameobject.h",
+        "python_include/funcobject.h",
+        "python_include/genobject.h",
+        "python_include/graminit.h",
+        "python_include/grammar.h",
+        "python_include/import.h",
+        "python_include/intrcheck.h",
+        "python_include/iterobject.h",
+        "python_include/listobject.h",
+        "python_include/longintrepr.h",
+        "python_include/longobject.h",
+        "python_include/marshal.h",
+        "python_include/memoryobject.h",
+        "python_include/metagrammar.h",
+        "python_include/methodobject.h",
+        "python_include/modsupport.h",
+        "python_include/moduleobject.h",
+        "python_include/namespaceobject.h",
+        "python_include/node.h",
+        "python_include/numpy/__multiarray_api.h",
+        "python_include/numpy/__ufunc_api.h",
+        "python_include/numpy/_neighborhood_iterator_imp.h",
+        "python_include/numpy/_numpyconfig.h",
+        "python_include/numpy/arrayobject.h",
+        "python_include/numpy/arrayscalars.h",
+        "python_include/numpy/halffloat.h",
+        "python_include/numpy/multiarray_api.txt",
+        "python_include/numpy/ndarrayobject.h",
+        "python_include/numpy/ndarraytypes.h",
+        "python_include/numpy/noprefix.h",
+        "python_include/numpy/npy_1_7_deprecated_api.h",
+        "python_include/numpy/npy_3kcompat.h",
+        "python_include/numpy/npy_common.h",
+        "python_include/numpy/npy_cpu.h",
+        "python_include/numpy/npy_endian.h",
+        "python_include/numpy/npy_interrupt.h",
+        "python_include/numpy/npy_math.h",
+        "python_include/numpy/npy_no_deprecated_api.h",
+        "python_include/numpy/npy_os.h",
+        "python_include/numpy/numpyconfig.h",
+        "python_include/numpy/old_defines.h",
+        "python_include/numpy/oldnumeric.h",
+        "python_include/numpy/ufunc_api.txt",
+        "python_include/numpy/ufuncobject.h",
+        "python_include/numpy/utils.h",
+        "python_include/object.h",
+        "python_include/objimpl.h",
+        "python_include/odictobject.h",
+        "python_include/opcode.h",
+        "python_include/osdefs.h",
+        "python_include/parsetok.h",
+        "python_include/patchlevel.h",
+        "python_include/pgen.h",
+        "python_include/pgenheaders.h",
+        "python_include/py_curses.h",
+        "python_include/pyarena.h",
+        "python_include/pyatomic.h",
+        "python_include/pycapsule.h",
+        "python_include/pyconfig.h",
+        "python_include/pyctype.h",
+        "python_include/pydebug.h",
+        "python_include/pyerrors.h",
+        "python_include/pyexpat.h",
+        "python_include/pyfpe.h",
+        "python_include/pygetopt.h",
+        "python_include/pyhash.h",
+        "python_include/pylifecycle.h",
+        "python_include/pymacconfig.h",
+        "python_include/pymacro.h",
+        "python_include/pymath.h",
+        "python_include/pymem.h",
+        "python_include/pyport.h",
+        "python_include/pystate.h",
+        "python_include/pystrcmp.h",
+        "python_include/pystrhex.h",
+        "python_include/pystrtod.h",
+        "python_include/pythonrun.h",
+        "python_include/pythread.h",
+        "python_include/pytime.h",
+        "python_include/rangeobject.h",
+        "python_include/setobject.h",
+        "python_include/sliceobject.h",
+        "python_include/structmember.h",
+        "python_include/structseq.h",
+        "python_include/symtable.h",
+        "python_include/sysmodule.h",
+        "python_include/token.h",
+        "python_include/traceback.h",
+        "python_include/tupleobject.h",
+        "python_include/typeslots.h",
+        "python_include/ucnhash.h",
+        "python_include/unicodeobject.h",
+        "python_include/warnings.h",
+        "python_include/weakrefobject.h",
+    ],
+    cmd = """
+cp -f "/usr/include/python3.5m/Python-ast.h" "$(@D)/python_include/Python-ast.h" && cp -f "/usr/include/python3.5m/Python.h" "$(@D)/python_include/Python.h" && cp -f "/usr/include/python3.5m/abstract.h" "$(@D)/python_include/abstract.h" && cp -f "/usr/include/python3.5m/accu.h" "$(@D)/python_include/accu.h" && cp -f "/usr/include/python3.5m/asdl.h" "$(@D)/python_include/asdl.h" && cp -f "/usr/include/python3.5m/ast.h" "$(@D)/python_include/ast.h" && cp -f "/usr/include/python3.5m/bitset.h" "$(@D)/python_include/bitset.h" && cp -f "/usr/include/python3.5m/bltinmodule.h" "$(@D)/python_include/bltinmodule.h" && cp -f "/usr/include/python3.5m/boolobject.h" "$(@D)/python_include/boolobject.h" && cp -f "/usr/include/python3.5m/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp -f "/usr/include/python3.5m/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp -f "/usr/include/python3.5m/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp -f "/usr/include/python3.5m/cellobject.h" "$(@D)/python_include/cellobject.h" && cp -f "/usr/include/python3.5m/ceval.h" "$(@D)/python_include/ceval.h" && cp -f "/usr/include/python3.5m/classobject.h" "$(@D)/python_include/classobject.h" && cp -f "/usr/include/python3.5m/code.h" "$(@D)/python_include/code.h" && cp -f "/usr/include/python3.5m/codecs.h" "$(@D)/python_include/codecs.h" && cp -f "/usr/include/python3.5m/compile.h" "$(@D)/python_include/compile.h" && cp -f "/usr/include/python3.5m/complexobject.h" "$(@D)/python_include/complexobject.h" && cp -f "/usr/include/python3.5m/datetime.h" "$(@D)/python_include/datetime.h" && cp -f "/usr/include/python3.5m/descrobject.h" "$(@D)/python_include/descrobject.h" && cp -f "/usr/include/python3.5m/dictobject.h" "$(@D)/python_include/dictobject.h" && cp -f "/usr/include/python3.5m/dtoa.h" "$(@D)/python_include/dtoa.h" && cp -f "/usr/include/python3.5m/dynamic_annotations.h" "$(@D)/python_include/dynamic_annotations.h" && cp -f "/usr/include/python3.5m/enumobject.h" "$(@D)/python_include/enumobject.h" && cp -f "/usr/include/python3.5m/errcode.h" "$(@D)/python_include/errcode.h" && cp -f "/usr/include/python3.5m/eval.h" "$(@D)/python_include/eval.h" && cp -f "/usr/include/python3.5m/fileobject.h" "$(@D)/python_include/fileobject.h" && cp -f "/usr/include/python3.5m/fileutils.h" "$(@D)/python_include/fileutils.h" && cp -f "/usr/include/python3.5m/floatobject.h" "$(@D)/python_include/floatobject.h" && cp -f "/usr/include/python3.5m/frameobject.h" "$(@D)/python_include/frameobject.h" && cp -f "/usr/include/python3.5m/funcobject.h" "$(@D)/python_include/funcobject.h" && cp -f "/usr/include/python3.5m/genobject.h" "$(@D)/python_include/genobject.h" && cp -f "/usr/include/python3.5m/graminit.h" "$(@D)/python_include/graminit.h" && cp -f "/usr/include/python3.5m/grammar.h" "$(@D)/python_include/grammar.h" && cp -f "/usr/include/python3.5m/import.h" "$(@D)/python_include/import.h" && cp -f "/usr/include/python3.5m/intrcheck.h" "$(@D)/python_include/intrcheck.h" && cp -f "/usr/include/python3.5m/iterobject.h" "$(@D)/python_include/iterobject.h" && cp -f "/usr/include/python3.5m/listobject.h" "$(@D)/python_include/listobject.h" && cp -f "/usr/include/python3.5m/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp -f "/usr/include/python3.5m/longobject.h" "$(@D)/python_include/longobject.h" && cp -f "/usr/include/python3.5m/marshal.h" "$(@D)/python_include/marshal.h" && cp -f "/usr/include/python3.5m/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp -f "/usr/include/python3.5m/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp -f "/usr/include/python3.5m/methodobject.h" "$(@D)/python_include/methodobject.h" && cp -f "/usr/include/python3.5m/modsupport.h" "$(@D)/python_include/modsupport.h" && cp -f "/usr/include/python3.5m/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp -f "/usr/include/python3.5m/namespaceobject.h" "$(@D)/python_include/namespaceobject.h" && cp -f "/usr/include/python3.5m/node.h" "$(@D)/python_include/node.h" && cp -f "/usr/include/python3.5m/numpy/__multiarray_api.h" "$(@D)/python_include/numpy/__multiarray_api.h" && cp -f "/usr/include/python3.5m/numpy/__ufunc_api.h" "$(@D)/python_include/numpy/__ufunc_api.h" && cp -f "/usr/include/python3.5m/numpy/_neighborhood_iterator_imp.h" "$(@D)/python_include/numpy/_neighborhood_iterator_imp.h" && cp -f "/usr/include/python3.5m/numpy/_numpyconfig.h" "$(@D)/python_include/numpy/_numpyconfig.h" && cp -f "/usr/include/python3.5m/numpy/arrayobject.h" "$(@D)/python_include/numpy/arrayobject.h" && cp -f "/usr/include/python3.5m/numpy/arrayscalars.h" "$(@D)/python_include/numpy/arrayscalars.h" && cp -f "/usr/include/python3.5m/numpy/halffloat.h" "$(@D)/python_include/numpy/halffloat.h" && cp -f "/usr/include/python3.5m/numpy/multiarray_api.txt" "$(@D)/python_include/numpy/multiarray_api.txt" && cp -f "/usr/include/python3.5m/numpy/ndarrayobject.h" "$(@D)/python_include/numpy/ndarrayobject.h" && cp -f "/usr/include/python3.5m/numpy/ndarraytypes.h" "$(@D)/python_include/numpy/ndarraytypes.h" && cp -f "/usr/include/python3.5m/numpy/noprefix.h" "$(@D)/python_include/numpy/noprefix.h" && cp -f "/usr/include/python3.5m/numpy/npy_1_7_deprecated_api.h" "$(@D)/python_include/numpy/npy_1_7_deprecated_api.h" && cp -f "/usr/include/python3.5m/numpy/npy_3kcompat.h" "$(@D)/python_include/numpy/npy_3kcompat.h" && cp -f "/usr/include/python3.5m/numpy/npy_common.h" "$(@D)/python_include/numpy/npy_common.h" && cp -f "/usr/include/python3.5m/numpy/npy_cpu.h" "$(@D)/python_include/numpy/npy_cpu.h" && cp -f "/usr/include/python3.5m/numpy/npy_endian.h" "$(@D)/python_include/numpy/npy_endian.h" && cp -f "/usr/include/python3.5m/numpy/npy_interrupt.h" "$(@D)/python_include/numpy/npy_interrupt.h" && cp -f "/usr/include/python3.5m/numpy/npy_math.h" "$(@D)/python_include/numpy/npy_math.h" && cp -f "/usr/include/python3.5m/numpy/npy_no_deprecated_api.h" "$(@D)/python_include/numpy/npy_no_deprecated_api.h" && cp -f "/usr/include/python3.5m/numpy/npy_os.h" "$(@D)/python_include/numpy/npy_os.h" && cp -f "/usr/include/python3.5m/numpy/numpyconfig.h" "$(@D)/python_include/numpy/numpyconfig.h" && cp -f "/usr/include/python3.5m/numpy/old_defines.h" "$(@D)/python_include/numpy/old_defines.h" && cp -f "/usr/include/python3.5m/numpy/oldnumeric.h" "$(@D)/python_include/numpy/oldnumeric.h" && cp -f "/usr/include/python3.5m/numpy/ufunc_api.txt" "$(@D)/python_include/numpy/ufunc_api.txt" && cp -f "/usr/include/python3.5m/numpy/ufuncobject.h" "$(@D)/python_include/numpy/ufuncobject.h" && cp -f "/usr/include/python3.5m/numpy/utils.h" "$(@D)/python_include/numpy/utils.h" && cp -f "/usr/include/python3.5m/object.h" "$(@D)/python_include/object.h" && cp -f "/usr/include/python3.5m/objimpl.h" "$(@D)/python_include/objimpl.h" && cp -f "/usr/include/python3.5m/odictobject.h" "$(@D)/python_include/odictobject.h" && cp -f "/usr/include/python3.5m/opcode.h" "$(@D)/python_include/opcode.h" && cp -f "/usr/include/python3.5m/osdefs.h" "$(@D)/python_include/osdefs.h" && cp -f "/usr/include/python3.5m/parsetok.h" "$(@D)/python_include/parsetok.h" && cp -f "/usr/include/python3.5m/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp -f "/usr/include/python3.5m/pgen.h" "$(@D)/python_include/pgen.h" && cp -f "/usr/include/python3.5m/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp -f "/usr/include/python3.5m/py_curses.h" "$(@D)/python_include/py_curses.h" && cp -f "/usr/include/python3.5m/pyarena.h" "$(@D)/python_include/pyarena.h" && cp -f "/usr/include/python3.5m/pyatomic.h" "$(@D)/python_include/pyatomic.h" && cp -f "/usr/include/python3.5m/pycapsule.h" "$(@D)/python_include/pycapsule.h" && cp -f "/usr/include/python3.5m/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp -f "/usr/include/python3.5m/pyctype.h" "$(@D)/python_include/pyctype.h" && cp -f "/usr/include/python3.5m/pydebug.h" "$(@D)/python_include/pydebug.h" && cp -f "/usr/include/python3.5m/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp -f "/usr/include/python3.5m/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp -f "/usr/include/python3.5m/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp -f "/usr/include/python3.5m/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp -f "/usr/include/python3.5m/pyhash.h" "$(@D)/python_include/pyhash.h" && cp -f "/usr/include/python3.5m/pylifecycle.h" "$(@D)/python_include/pylifecycle.h" && cp -f "/usr/include/python3.5m/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp -f "/usr/include/python3.5m/pymacro.h" "$(@D)/python_include/pymacro.h" && cp -f "/usr/include/python3.5m/pymath.h" "$(@D)/python_include/pymath.h" && cp -f "/usr/include/python3.5m/pymem.h" "$(@D)/python_include/pymem.h" && cp -f "/usr/include/python3.5m/pyport.h" "$(@D)/python_include/pyport.h" && cp -f "/usr/include/python3.5m/pystate.h" "$(@D)/python_include/pystate.h" && cp -f "/usr/include/python3.5m/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp -f "/usr/include/python3.5m/pystrhex.h" "$(@D)/python_include/pystrhex.h" && cp -f "/usr/include/python3.5m/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp -f "/usr/include/python3.5m/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp -f "/usr/include/python3.5m/pythread.h" "$(@D)/python_include/pythread.h" && cp -f "/usr/include/python3.5m/pytime.h" "$(@D)/python_include/pytime.h" && cp -f "/usr/include/python3.5m/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp -f "/usr/include/python3.5m/setobject.h" "$(@D)/python_include/setobject.h" && cp -f "/usr/include/python3.5m/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp -f "/usr/include/python3.5m/structmember.h" "$(@D)/python_include/structmember.h" && cp -f "/usr/include/python3.5m/structseq.h" "$(@D)/python_include/structseq.h" && cp -f "/usr/include/python3.5m/symtable.h" "$(@D)/python_include/symtable.h" && cp -f "/usr/include/python3.5m/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp -f "/usr/include/python3.5m/token.h" "$(@D)/python_include/token.h" && cp -f "/usr/include/python3.5m/traceback.h" "$(@D)/python_include/traceback.h" && cp -f "/usr/include/python3.5m/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp -f "/usr/include/python3.5m/typeslots.h" "$(@D)/python_include/typeslots.h" && cp -f "/usr/include/python3.5m/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp -f "/usr/include/python3.5m/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp -f "/usr/include/python3.5m/warnings.h" "$(@D)/python_include/warnings.h" && cp -f "/usr/include/python3.5m/weakrefobject.h" "$(@D)/python_include/weakrefobject.h"
+   """,
+)
+
+genrule(
+    name = "numpy_include",
+    outs = [
+        "numpy_include/numpy/__multiarray_api.h",
+        "numpy_include/numpy/__ufunc_api.h",
+        "numpy_include/numpy/_neighborhood_iterator_imp.h",
+        "numpy_include/numpy/_numpyconfig.h",
+        "numpy_include/numpy/arrayobject.h",
+        "numpy_include/numpy/arrayscalars.h",
+        "numpy_include/numpy/halffloat.h",
+        "numpy_include/numpy/multiarray_api.txt",
+        "numpy_include/numpy/ndarrayobject.h",
+        "numpy_include/numpy/ndarraytypes.h",
+        "numpy_include/numpy/noprefix.h",
+        "numpy_include/numpy/npy_1_7_deprecated_api.h",
+        "numpy_include/numpy/npy_3kcompat.h",
+        "numpy_include/numpy/npy_common.h",
+        "numpy_include/numpy/npy_cpu.h",
+        "numpy_include/numpy/npy_endian.h",
+        "numpy_include/numpy/npy_interrupt.h",
+        "numpy_include/numpy/npy_math.h",
+        "numpy_include/numpy/npy_no_deprecated_api.h",
+        "numpy_include/numpy/npy_os.h",
+        "numpy_include/numpy/numpyconfig.h",
+        "numpy_include/numpy/old_defines.h",
+        "numpy_include/numpy/oldnumeric.h",
+        "numpy_include/numpy/ufunc_api.txt",
+        "numpy_include/numpy/ufuncobject.h",
+        "numpy_include/numpy/utils.h",
+    ],
+    cmd = """
+cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_os.h" "$(@D)/numpy_include/numpy/npy_os.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h"
+   """,
+)
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/py3/WORKSPACE b/third_party/toolchains/preconfig/ubuntu16.04/py3/WORKSPACE
new file mode 100644
index 0000000000000000000000000000000000000000..1d298fefa3bf40b2c02605960d69c5974e9de7b7
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/py3/WORKSPACE
@@ -0,0 +1,2 @@
+# DO NOT EDIT: automatically generated WORKSPACE file for python_configure rule
+workspace(name = "local_config_python")